Merge branch 'develop' into group-bitmap-accessor

Commit by Axel Kohlmeyer, 2025-01-25 13:49:33 -05:00
293 changed files with 70728 additions and 7108 deletions

View File

@@ -67,7 +67,6 @@ jobs:
 -D PKG_MANIFOLD=on \
 -D PKG_MDI=on \
 -D PKG_MGPT=on \
--D PKG_ML-PACE=on \
 -D PKG_ML-RANN=on \
 -D PKG_MOLFILE=on \
 -D PKG_NETCDF=on \

View File

@@ -0,0 +1,53 @@
# GitHub action to build LAMMPS-GUI as a flatpak bundle
name: "Build LAMMPS-GUI as flatpak bundle"

on:
  push:
    branches:
      - develop
  workflow_dispatch:

concurrency:
  group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

jobs:
  build:
    name: LAMMPS-GUI flatpak build
    if: ${{ github.repository == 'lammps/lammps' }}
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 2

      - name: Install extra packages
        run: |
          sudo apt-get update
          sudo apt-get install -y ccache \
                                  libeigen3-dev \
                                  libcurl4-openssl-dev \
                                  mold \
                                  ninja-build \
                                  python3-dev \
                                  flatpak \
                                  flatpak-builder

      - name: Set up access to flatpak repo
        run: flatpak --user remote-add --if-not-exists flathub https://dl.flathub.org/repo/flathub.flatpakrepo

      - name: Build flatpak
        run: |
          mkdir flatpak-state
          sed -i -e 's/branch:.*/branch: develop/' tools/lammps-gui/org.lammps.lammps-gui.yml
          flatpak-builder --force-clean --verbose --repo=flatpak-repo \
                          --install-deps-from=flathub --state-dir=flatpak-state \
                          --user --ccache --default-branch=${{ github.ref_name }} \
                          flatpak-build tools/lammps-gui/org.lammps.lammps-gui.yml
          flatpak build-bundle --runtime-repo=https://flathub.org/repo/flathub.flatpakrepo \
                          --verbose flatpak-repo LAMMPS-Linux-x86_64-GUI.flatpak \
                          org.lammps.lammps-gui ${{ github.ref_name }}
          flatpak install -y -v --user LAMMPS-Linux-x86_64-GUI.flatpak
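
The resulting single-file bundle can then be installed and started locally in
the same way as the workflow's final step does; a minimal sketch (the
application id comes from the manifest referenced above):

   flatpak install --user -y LAMMPS-Linux-x86_64-GUI.flatpak
   flatpak run org.lammps.lammps-gui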

View File

@@ -35,3 +35,4 @@ jobs:
 make check-permissions
 make check-homepage
 make check-errordocs
+make check-fmtlib

.github/workflows/unittest-arm64.yml (new file)
View File

@@ -0,0 +1,81 @@
# GitHub action to build LAMMPS on Linux with ARM64 and run standard unit tests
name: "Unittest for Linux on ARM64"

on:
  push:
    branches: [develop]
  workflow_dispatch:

concurrency:
  group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

jobs:
  build:
    name: Linux ARM64 Unit Test
    if: ${{ github.repository == 'lammps/lammps' }}
    runs-on: ubuntu-22.04-arm
    env:
      CCACHE_DIR: ${{ github.workspace }}/.ccache

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 2

      - name: Install extra packages
        run: |
          sudo apt-get update
          sudo apt-get install -y ccache \
                                  libeigen3-dev \
                                  libcurl4-openssl-dev \
                                  mold \
                                  ninja-build \
                                  python3-dev

      - name: Create Build Environment
        run: mkdir build

      - name: Set up ccache
        uses: actions/cache@v4
        with:
          path: ${{ env.CCACHE_DIR }}
          key: linux-unit-ccache-${{ github.sha }}
          restore-keys: linux-unit-ccache-

      - name: Building LAMMPS via CMake
        shell: bash
        run: |
          ccache -z
          python3 -m venv linuxenv
          source linuxenv/bin/activate
          python3 -m pip install numpy
          python3 -m pip install pyyaml
          cmake -S cmake -B build \
                -C cmake/presets/gcc.cmake \
                -C cmake/presets/most.cmake \
                -D CMAKE_CXX_COMPILER_LAUNCHER=ccache \
                -D CMAKE_C_COMPILER_LAUNCHER=ccache \
                -D BUILD_SHARED_LIBS=on \
                -D DOWNLOAD_POTENTIALS=off \
                -D ENABLE_TESTING=on \
                -D MLIAP_ENABLE_ACE=on \
                -D MLIAP_ENABLE_PYTHON=off \
                -D PKG_MANIFOLD=on \
                -D PKG_ML-PACE=on \
                -D PKG_ML-RANN=on \
                -D PKG_RHEO=on \
                -D PKG_PTM=on \
                -D PKG_PYTHON=on \
                -D PKG_QTB=on \
                -D PKG_SMTBQ=on \
                -G Ninja
          cmake --build build
          ccache -s

      - name: Run Tests
        working-directory: build
        shell: bash
        run: ctest -V -LE unstable

View File

@@ -605,6 +605,16 @@ foreach(PKG_WITH_INCL KSPACE PYTHON ML-IAP VORONOI COLVARS ML-HDNNP MDI MOLFILE
   endif()
 endforeach()

+# settings for misc packages and styles
+if(PKG_MISC)
+  option(LAMMPS_ASYNC_IMD "Asynchronous IMD processing" OFF)
+  mark_as_advanced(LAMMPS_ASYNC_IMD)
+  if(LAMMPS_ASYNC_IMD)
+    target_compile_definitions(lammps PRIVATE -DLAMMPS_ASYNC_IMD)
+    message(STATUS "Using IMD in asynchronous mode")
+  endif()
+endif()
+
 # optionally enable building script wrappers using swig
 option(WITH_SWIG "Build scripting language wrappers with SWIG" OFF)
 if(WITH_SWIG)

View File

@@ -72,6 +72,10 @@ if(INTEL_ARCH STREQUAL "KNL")
 if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
   message(FATAL_ERROR "Must use Intel compiler with INTEL for KNL architecture")
 endif()
+message(WARNING "Support for Intel Xeon Phi accelerators and Knight's Landing CPUs "
+  "will be removed from LAMMPS in Summer 2025 due to lack of available machines "
+  "in labs and HPC centers and removed support in recent compilers. "
+  "Please contact developers@lammps.org if you have any concerns about this step.")
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -xHost -qopenmp -qoffload")
 set(MIC_OPTIONS "-qoffload-option,mic,compiler,\"-fp-model fast=2 -mGLOB_default_function_attrs=\\\"gather_scatter_loop_unroll=4\\\"\"")
 target_compile_options(lammps PRIVATE -xMIC-AVX512 -qoffload -fno-alias -ansi-alias -restrict -qoverride-limits ${MIC_OPTIONS})

View File

@@ -117,7 +117,6 @@ set(KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/kokkos.cpp
 ${KOKKOS_PKG_SOURCES_DIR}/atom_vec_kokkos.cpp
 ${KOKKOS_PKG_SOURCES_DIR}/comm_kokkos.cpp
 ${KOKKOS_PKG_SOURCES_DIR}/comm_tiled_kokkos.cpp
-${KOKKOS_PKG_SOURCES_DIR}/group_kokkos.cpp
 ${KOKKOS_PKG_SOURCES_DIR}/min_kokkos.cpp
 ${KOKKOS_PKG_SOURCES_DIR}/min_linesearch_kokkos.cpp
 ${KOKKOS_PKG_SOURCES_DIR}/neighbor_kokkos.cpp

View File

@@ -48,6 +48,7 @@ This is the list of packages that may require additional steps.
 * :ref:`LEPTON <lepton>`
 * :ref:`MACHDYN <machdyn>`
 * :ref:`MDI <mdi>`
+* :ref:`MISC <misc>`
 * :ref:`ML-HDNNP <ml-hdnnp>`
 * :ref:`ML-IAP <mliap>`
 * :ref:`ML-PACE <ml-pace>`

@@ -2031,7 +2032,7 @@ TBB and MKL.
 .. _mdi:

 MDI package
------------------------------
+-----------

 .. tabs::

@@ -2058,6 +2059,37 @@ MDI package
 ----------

+.. _misc:
+
+MISC package
+------------
+
+The :doc:`fix imd <fix_imd>` style in this package can be run either
+synchronously (communication with IMD clients is done in the main
+process) or asynchronously (the fix spawns a separate thread that can
+communicate with IMD clients concurrently to the LAMMPS execution).
+
+.. tabs::
+
+   .. tab:: CMake build
+
+      .. code-block:: bash
+
+         -D LAMMPS_ASYNC_IMD=value   # Run IMD server asynchronously
+                                     # value = no (default) or yes
+
+   .. tab:: Traditional make
+
+      To enable asynchronous mode the ``-DLAMMPS_ASYNC_IMD`` define
+      needs to be added to the ``LMP_INC`` variable in the
+      ``Makefile.machine`` you are using. For example:
+
+      .. code-block:: make
+
+         LMP_INC = -DLAMMPS_ASYNC_IMD -DLAMMPS_MEMALIGN=64
+
+----------
+
 .. _molfile:

 MOLFILE package

View File

@@ -49,6 +49,7 @@ packages:
 * :ref:`LEPTON <lepton>`
 * :ref:`MACHDYN <machdyn>`
 * :ref:`MDI <mdi>`
+* :ref:`MISC <misc>`
 * :ref:`ML-HDNNP <ml-hdnnp>`
 * :ref:`ML-IAP <mliap>`
 * :ref:`ML-PACE <ml-pace>`

View File

@@ -58,6 +58,7 @@ KOKKOS, o = OPENMP, t = OPT.
 * :doc:`fep/ta <compute_fep_ta>`
 * :doc:`force/tally <compute_tally>`
 * :doc:`fragment/atom <compute_cluster_atom>`
+* :doc:`gaussian/grid/local (k) <compute_gaussian_grid_local>`
 * :doc:`global/atom <compute_global_atom>`
 * :doc:`group/group <compute_group_group>`
 * :doc:`gyration <compute_gyration>`

@@ -140,8 +141,8 @@ KOKKOS, o = OPENMP, t = OPT.
 * :doc:`smd/vol <compute_smd_vol>`
 * :doc:`snap <compute_sna_atom>`
 * :doc:`sna/atom <compute_sna_atom>`
-* :doc:`sna/grid <compute_sna_atom>`
-* :doc:`sna/grid/local <compute_sna_atom>`
+* :doc:`sna/grid (k) <compute_sna_atom>`
+* :doc:`sna/grid/local (k) <compute_sna_atom>`
 * :doc:`snad/atom <compute_sna_atom>`
 * :doc:`snav/atom <compute_sna_atom>`
 * :doc:`sph/e/atom <compute_sph_e_atom>`

View File

@@ -58,6 +58,7 @@ OPT.
 * :doc:`dt/reset (k) <fix_dt_reset>`
 * :doc:`edpd/source <fix_dpd_source>`
 * :doc:`efield (k) <fix_efield>`
+* :doc:`efield/lepton <fix_efield_lepton>`
 * :doc:`efield/tip4p <fix_efield>`
 * :doc:`ehex <fix_ehex>`
 * :doc:`electrode/conp (i) <fix_electrode>`

View File

@@ -80,6 +80,7 @@ OPT.
 * :doc:`coul/tt <pair_coul_tt>`
 * :doc:`coul/wolf (ko) <pair_coul>`
 * :doc:`coul/wolf/cs <pair_cs>`
+* :doc:`dispersion/d3 <pair_dispersion_d3>`
 * :doc:`dpd (giko) <pair_dpd>`
 * :doc:`dpd/coul/slater/long (g) <pair_dpd_coul_slater_long>`
 * :doc:`dpd/ext (ko) <pair_dpd_ext>`

View File

@@ -1,6 +1,10 @@
 Removed commands and packages
 =============================

+.. contents::
+
+------
+
 This page lists LAMMPS commands and packages that have been removed from
 the distribution and provides suggestions for alternatives or
 replacements. LAMMPS has special dummy styles implemented, that will

@@ -8,47 +12,60 @@ stop LAMMPS and print a suitable error message in most cases, when a
 style/command is used that has been removed or will replace the command
 with the direct alternative (if available) and print a warning.

-restart2data tool
------------------
-
-.. versionchanged:: 23Nov2013
-
-The functionality of the restart2data tool has been folded into the
-LAMMPS executable directly instead of having a separate tool. A
-combination of the commands :doc:`read_restart <read_restart>` and
-:doc:`write_data <write_data>` can be used to the same effect. For
-added convenience this conversion can also be triggered by
-:doc:`command-line flags <Run_options>`
-
-Fix ave/spatial and fix ave/spatial/sphere
-------------------------------------------
-
-.. deprecated:: 11Dec2015
-
-The fixes ave/spatial and ave/spatial/sphere have been removed from LAMMPS
-since they were superseded by the more general and extensible "chunk
-infrastructure". Here the system is partitioned in one of many possible
-ways through the :doc:`compute chunk/atom <compute_chunk_atom>` command
-and then averaging is done using :doc:`fix ave/chunk <fix_ave_chunk>`.
-Please refer to the :doc:`chunk HOWTO <Howto_chunk>` section for an overview.
-
-Box command
------------
-
-.. deprecated:: 22Dec2022
-
-The *box* command has been removed and the LAMMPS code changed so it won't
-be needed. If present, LAMMPS will ignore the command and print a warning.
-
-Reset_ids, reset_atom_ids, reset_mol_ids commands
--------------------------------------------------
-
-.. deprecated:: 22Dec2022
-
-The *reset_ids*, *reset_atom_ids*, and *reset_mol_ids* commands have
-been folded into the :doc:`reset_atoms <reset_atoms>` command. If
-present, LAMMPS will replace the commands accordingly and print a
-warning.
+LAMMPS shell
+------------
+
+.. versionchanged:: 29Aug2024
+
+The LAMMPS shell has been removed from the LAMMPS distribution. Users
+are encouraged to use the :ref:`LAMMPS-GUI <lammps_gui>` tool instead.
+
+i-PI tool
+---------
+
+.. versionchanged:: 27Jun2024
+
+The i-PI tool has been removed from the LAMMPS distribution. Instead,
+instructions to install i-PI from PyPI via pip are provided.
+
+USER-REAXC package
+------------------
+
+.. deprecated:: 7Feb2024
+
+The USER-REAXC package has been renamed to :ref:`REAXFF <PKG-REAXFF>`.
+In the process also the pair style and related fixes were renamed to use
+the "reaxff" string instead of "reax/c". For a while LAMMPS was maintaining
+backward compatibility by providing aliases for the styles. These have
+been removed, so using "reaxff" is now *required*.
+
+MPIIO package
+-------------
+
+.. deprecated:: 21Nov2023
+
+The MPIIO package has been removed from LAMMPS since it was unmaintained
+for many years and thus not updated to incorporate required changes that
+had been applied to the corresponding non-MPIIO commands. As a
+consequence the MPIIO commands had become unreliable and sometimes
+crashing LAMMPS or corrupting data. Similar functionality is available
+through the :ref:`ADIOS package <PKG-ADIOS>` and the :ref:`NETCDF
+package <PKG-NETCDF>`. Also, the :doc:`dump_modify nfile or dump_modify
+fileper <dump_modify>` keywords may be used for an efficient way of
+writing out dump files when running on large numbers of processors.
+Similarly, the "nfile" and "fileper" keywords exist for restarts:
+see :doc:`restart <restart>`, :doc:`read_restart <read_restart>`,
+:doc:`write_restart <write_restart>`.
+
+MSCG package
+------------
+
+.. deprecated:: 21Nov2023
+
+The MSCG package has been removed from LAMMPS since it was unmaintained
+for many years and instead superseded by the `OpenMSCG software
+<https://software.rcc.uchicago.edu/mscg/>`_ of the Voth group at the
+University of Chicago, which can be used independent from LAMMPS.

 LATTE package
 -------------

@@ -64,18 +81,6 @@ packages, including LATTE. See the ``examples/QUANTUM`` dir and the
 with LATTE as a plugin library (similar to the way fix latte worked), as
 well as on a different set of MPI processors.

-MEAM package
-------------
-
-The MEAM package in Fortran has been replaced by a C++ implementation.
-The code in the :ref:`MEAM package <PKG-MEAM>` is a translation of the
-Fortran code of MEAM into C++, which removes several restrictions
-(e.g. there can be multiple instances in hybrid pair styles) and allows
-for some optimizations leading to better performance. The pair style
-:doc:`meam <pair_meam>` has the exact same syntax. For a transition
-period the C++ version of MEAM was called USER-MEAMC so it could
-coexist with the Fortran version.
-
 Minimize style fire/old
 -----------------------

@@ -97,38 +102,38 @@ The same functionality is available through
 :doc:`bond style mesocnt <bond_mesocnt>` and
 :doc:`angle style mesocnt <angle_mesocnt>`.

-MPIIO package
--------------
-
-.. deprecated:: 21Nov2023
-
-The MPIIO package has been removed from LAMMPS since it was unmaintained
-for many years and thus not updated to incorporate required changes that
-had been applied to the corresponding non-MPIIO commands. As a
-consequence the MPIIO commands had become unreliable and sometimes
-crashing LAMMPS or corrupting data. Similar functionality is available
-through the :ref:`ADIOS package <PKG-ADIOS>` and the :ref:`NETCDF
-package <PKG-NETCDF>`. Also, the :doc:`dump_modify nfile or dump_modify
-fileper <dump_modify>` keywords may be used for an efficient way of
-writing out dump files when running on large numbers of processors.
-Similarly, the "nfile" and "fileper" keywords exist for restarts:
-see :doc:`restart <restart>`, :doc:`read_restart <read_restart>`,
-:doc:`write_restart <write_restart>`.
-
-MSCG package
-------------
-
-.. deprecated:: 21Nov2023
-
-The MSCG package has been removed from LAMMPS since it was unmaintained
-for many years and instead superseded by the `OpenMSCG software
-<https://software.rcc.uchicago.edu/mscg/>`_ of the Voth group at the
-University of Chicago, which can be used independent from LAMMPS.
+Box command
+-----------
+
+.. deprecated:: 22Dec2022
+
+The *box* command has been removed and the LAMMPS code changed so it won't
+be needed. If present, LAMMPS will ignore the command and print a warning.
+
+Reset_ids, reset_atom_ids, reset_mol_ids commands
+-------------------------------------------------
+
+.. deprecated:: 22Dec2022
+
+The *reset_ids*, *reset_atom_ids*, and *reset_mol_ids* commands have
+been folded into the :doc:`reset_atoms <reset_atoms>` command. If
+present, LAMMPS will replace the commands accordingly and print a
+warning.
+
+MESSAGE package
+---------------
+
+.. deprecated:: 4May2022
+
+The MESSAGE package has been removed since it was superseded by the
+:ref:`MDI package <PKG-MDI>`. MDI implements the same functionality
+and in a more general way with direct support for more applications.

 REAX package
 ------------

+.. deprecated:: 4Jan2019
+
 The REAX package has been removed since it was superseded by the
 :ref:`REAXFF package <PKG-REAXFF>`. The REAXFF package has been tested
 to yield equivalent results to the REAX package, offers better

@@ -138,20 +143,25 @@ syntax compatible with the removed reax pair style, so input files will
 have to be adapted. The REAXFF package was originally called
 USER-REAXC.

-USER-REAXC package
-------------------
-
-.. deprecated:: 7Feb2024
-
-The USER-REAXC package has been renamed to :ref:`REAXFF <PKG-REAXFF>`.
-In the process also the pair style and related fixes were renamed to use
-the "reaxff" string instead of "reax/c". For a while LAMMPS was maintaining
-backward compatibility by providing aliases for the styles. These have
-been removed, so using "reaxff" is now *required*.
+MEAM package
+------------
+
+.. deprecated:: 4Jan2019
+
+The MEAM package in Fortran has been replaced by a C++ implementation.
+The code in the :ref:`MEAM package <PKG-MEAM>` is a translation of the
+Fortran code of MEAM into C++, which removes several restrictions
+(e.g. there can be multiple instances in hybrid pair styles) and allows
+for some optimizations leading to better performance. The pair style
+:doc:`meam <pair_meam>` has the exact same syntax. For a transition
+period the C++ version of MEAM was called USER-MEAMC so it could
+coexist with the Fortran version.

 USER-CUDA package
 -----------------

+.. deprecated:: 31May2016
+
 The USER-CUDA package had been removed, since it had been unmaintained
 for a long time and had known bugs and problems. Significant parts of
 the design were transferred to the

@@ -160,19 +170,27 @@ performance characteristics on NVIDIA GPUs. Both, the KOKKOS
 and the :ref:`GPU package <PKG-GPU>` are maintained
 and allow running LAMMPS with GPU acceleration.

-i-PI tool
----------
-
-.. versionchanged:: 27Jun2024
-
-The i-PI tool has been removed from the LAMMPS distribution. Instead,
-instructions to install i-PI from PyPI via pip are provided.
-
-LAMMPS shell
-------------
-
-.. versionchanged:: 29Aug2024
-
-The LAMMPS shell has been removed from the LAMMPS distribution. Users
-are encouraged to use the :ref:`LAMMPS-GUI <lammps_gui>` tool instead.
+Fix ave/spatial and fix ave/spatial/sphere
+------------------------------------------
+
+.. deprecated:: 11Dec2015
+
+The fixes ave/spatial and ave/spatial/sphere have been removed from LAMMPS
+since they were superseded by the more general and extensible "chunk
+infrastructure". Here the system is partitioned in one of many possible
+ways through the :doc:`compute chunk/atom <compute_chunk_atom>` command
+and then averaging is done using :doc:`fix ave/chunk <fix_ave_chunk>`.
+Please refer to the :doc:`chunk HOWTO <Howto_chunk>` section for an overview.
+
+restart2data tool
+-----------------
+
+.. deprecated:: 23Nov2013
+
+The functionality of the restart2data tool has been folded into the
+LAMMPS executable directly instead of having a separate tool. A
+combination of the commands :doc:`read_restart <read_restart>` and
+:doc:`write_data <write_data>` can be used to the same effect. For
+added convenience this conversion can also be triggered by
+:doc:`command-line flags <Run_options>`

View File

@@ -300,18 +300,19 @@ Formatting with the {fmt} library
 The LAMMPS source code includes a copy of the `{fmt} library
 <https://fmt.dev>`_, which is preferred over formatting with the
-"printf()" family of functions. The primary reason is that it allows
-a typesafe default format for any type of supported data. This is
-particularly useful for formatting integers of a given size (32-bit or
-64-bit) which may require different format strings depending on
-compile time settings or compilers/operating systems. Furthermore,
-{fmt} gives better performance, has more functionality, a familiar
-formatting syntax that has similarities to ``format()`` in Python, and
-provides a facility that can be used to integrate format strings and a
-variable number of arguments into custom functions in a much simpler
-way than the varargs mechanism of the C library. Finally, {fmt} has
-been included into the C++20 language standard, so changes to adopt it
-are future-proof.
+"printf()" family of functions. The primary reason is that it allows a
+typesafe default format for any type of supported data. This is
+particularly useful for formatting integers of a given size (32-bit or
+64-bit) which may require different format strings depending on compile
+time settings or compilers/operating systems. Furthermore, {fmt} gives
+better performance, has more functionality, a familiar formatting syntax
+that has similarities to ``format()`` in Python, and provides a facility
+that can be used to integrate format strings and a variable number of
+arguments into custom functions in a much simpler way than the varargs
+mechanism of the C library. Finally, {fmt} has been included into the
+C++20 language standard as ``std::format()``, so changes to adopt it are
+future-proof, for as long as they are not using any extensions that are
+not (yet) included into C++.

 Formatted strings are frequently created by calling the
 ``fmt::format()`` function, which will return a string as a

@@ -319,11 +320,13 @@ Formatted strings are frequently created by calling the
 ``printf()``, the {fmt} library uses ``{}`` to embed format descriptors.
 In the simplest case, no additional characters are needed, as {fmt} will
 choose the default format based on the data type of the argument.
-Otherwise, the ``fmt::print()`` function may be used instead of
-``printf()`` or ``fprintf()``. In addition, several LAMMPS output
-functions, that originally accepted a single string as argument have
-been overloaded to accept a format string with optional arguments as
-well (e.g., ``Error::all()``, ``Error::one()``, ``utils::logmesg()``).
+Otherwise, the :cpp:func:`utils::print() <LAMMPS_NS::utils::print>`
+function may be used instead of ``printf()`` or ``fprintf()``. In
+addition, several LAMMPS output functions, that originally accepted a
+single string as argument have been overloaded to accept a format string
+with optional arguments as well (e.g., ``Error::all()``,
+``Error::one()``, :cpp:func:`utils::logmesg()
+<LAMMPS_NS::utils::logmesg>`).

 Summary of the {fmt} format syntax
 ==================================

View File

@@ -7,13 +7,7 @@ typically document what a variable stores, what a small section of
 code does, or what a function does and its input/outputs. The topics
 on this page are intended to document code functionality at a higher level.

-Available topics are:
-
-- `Reading and parsing of text and text files`_
-- `Requesting and accessing neighbor lists`_
-- `Choosing between a custom atom style, fix property/atom, and fix STORE/ATOM`_
-- `Fix contributions to instantaneous energy, virial, and cumulative energy`_
-- `KSpace PPPM FFT grids`_
+.. contents::

 ----

@@ -218,6 +212,146 @@ command:
    neighbor->add_request(this, "delete_atoms", NeighConst::REQ_FULL);
Errors, warnings, and informational messages
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
LAMMPS has specialized functionality to handle errors (which should
terminate LAMMPS), warning messages (which should indicate possible
problems *without* terminating LAMMPS), and informational messages about
progress and chosen settings. We *strongly* encourage using these
facilities and *staying away* from using ``printf()``, ``fprintf()``,
``std::cout``, or ``std::cerr`` and from calling ``MPI_Abort()`` or
``exit()`` directly. Warnings and informational messages should be
printed only on MPI rank 0 to avoid flooding the output when running in
parallel with many MPI processes.
**Errors**
When LAMMPS encounters an error, for example a syntax error in the
input, a suitable error message should be printed by calling either
``Error::all()`` or ``Error::one()`` with a brief, one-line remark about
the reason. ``Error::all()`` must be called when the failing code path
is executed by *all* MPI processes and the error condition appears the
same on *all* MPI processes. If desired, each MPI process may set a flag
to either 0 or 1, and an ``MPI_Allreduce()`` searching for the maximum
can then be used to determine if there was an error on *any* of the MPI
processes and to make this information available to *all*.
``Error::one()``, in contrast, needs to be called when only one or a few
MPI processes execute the code path or can have the error condition.
``Error::all()`` is generally the preferred option.
Calling these functions does not abort LAMMPS directly, but rather
throws either a ``LAMMPSException`` (from ``Error::all()``) or a
``LAMMPSAbortException`` (from ``Error::one()``). These exceptions are
caught by the LAMMPS ``main()`` program and then handled accordingly.
The reason for this approach is to support applications, especially
graphical applications like :ref:`LAMMPS-GUI <lammps_gui>`, that are
linked to the LAMMPS library and need a mechanism to prevent an error in
LAMMPS from terminating the application. By catching the exceptions, the
application can delete the failing LAMMPS class instance and create a
new one to try again. In a similar fashion, the :doc:`LAMMPS Python
module <Python_module>` checks for this and then re-throws a
corresponding Python exception, which in turn can be caught by the
calling Python code.
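
A rough sketch of this pattern, as it might look in an application
embedding the LAMMPS library (the surrounding function is hypothetical;
only the class and exception names come from the text above):

.. code-block:: c++

   // hypothetical embedding code: recover from a failed input script by
   // discarding the failed LAMMPS instance instead of terminating
   #include "lammps.h"
   #include "input.h"
   #include "exceptions.h"
   #include <mpi.h>

   using namespace LAMMPS_NS;

   void run_script(int argc, char **argv, MPI_Comm comm, const char *infile)
   {
     LAMMPS *lmp = new LAMMPS(argc, argv, comm);
     try {
       lmp->input->file(infile);
     } catch (LAMMPSException &) {
       delete lmp;                            // drop the failed instance
       lmp = new LAMMPS(argc, argv, comm);    // create a fresh one to try again
     }
     delete lmp;
   }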
There are multiple "signatures" that can be called:
- ``Error::all(FLERR, "Error message")``: this will abort LAMMPS with
the error message "Error message", followed by the last line of input
that was read and processed before the error condition happened.
- ``Error::all(FLERR, Error::NOLASTLINE, "Error message")``: this is the
same as before but without the last line of input. This is preferred
for errors that would happen *during* a :doc:`run <run>` or
:doc:`minimization <minimize>`, since showing the "run" or "minimize"
command would be the last line, but is unrelated to the error.
- ``Error::all(FLERR, idx, "Error message")``: this is for argument
parsing where "idx" is the index (starting at 0) of the argument for a
LAMMPS command that is causing the failure (use -1 for the command
itself). The output may also include the last input line *before* and
*after*, if they differ due to substituting variables. A textual
indicator points to the specific word that failed. Using the
constant ``Error::NOPOINTER`` in place of the *idx* argument
suppresses the marker; the behavior is then as if the *idx* argument
had not been provided.
FLERR is a macro containing the file name and line number where the Error
class method is called; that information is appended to the error message.
This allows one to quickly find the relevant source code causing the error. For
all three signatures, the single string "Error message" may be replaced
with a format string using '{}' placeholders and followed by a variable
number of arguments, one for each placeholder. This format string and
the arguments are then handed for formatting to the `{fmt} library
<https://fmt.dev>`_ (which is bundled with LAMMPS) and thus allow
processing similar to the "format()" functionality in Python.
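
As a quick, made-up illustration of these three signatures with
{fmt}-style placeholders (the message texts and variable names are
invented):

.. code-block:: c++

   // report an error, including the last input line
   error->all(FLERR, "Unknown compute style: {}", style);
   // suppress printing the last input line (e.g. during a run)
   error->all(FLERR, Error::NOLASTLINE, "Lost atoms: expected {} found {}", nexpected, nfound);
   // point at argument 2 of the failing command
   error->all(FLERR, 2, "Invalid group ID: {}", arg[2]);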
.. note::
   For commands like :doc:`fix ave/time <fix_ave_time>` that accept
   wildcard arguments, the :cpp:func:`utils::expand_args` function
   may be given an optional argument through which it provides a map
   from the expanded argument indices back to the original arguments.
For complex errors that can have multiple causes and cannot be explained
in a single line, you can append to the error message the string created
by :cpp:func:`utils::errorurl`, which provides a URL pointing to the
paragraph of the :doc:`Errors_details` page that corresponds to the
number provided. Example:
.. code-block:: c++

   error->all(FLERR, "Unknown identifier in data file: {}{}", keyword, utils::errorurl(1));
This will output something like this:

.. parsed-literal::

   ERROR: Unknown identifier in data file: Massess
   For more information see https://docs.lammps.org/err0001 (src/read_data.cpp:1482)
   Last input line: read_data data.peptide

The URL points to the first paragraph with explanations on
the :doc:`Errors_details` page in the manual.
**Warnings**
To print warnings, the ``Error::warning()`` function should be used.
It also requires the FLERR macro as first argument to easily identify
the location of the warning in the source code. Same as with the error
functions above, the function has two variants: one taking just a single
string as final argument and a second that uses the `{fmt} library
<https://fmt.dev>`_ to make it similar to, say, ``fprintf()``. One
motivation to use this function is that it always outputs warnings with
the same capitalization of the leading "WARNING" string. A second is
that it has a built-in rate limiter: after a given number of warnings
(100 by default, adjustable via the :doc:`thermo_modify command
<thermo_modify>`), no more warnings are printed. Also, warnings are
written consistently to both screen and logfile (or neither), depending
on the settings for :ref:`screen <screen>` or :doc:`logfile <log>` output.
.. note::

   Unlike ``Error::all()``, the warning function will produce output on
   *every* MPI process, so it typically would be prefixed with an if
   statement testing for ``comm->me == 0``, i.e. limiting output to MPI
   rank 0.
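
A minimal, made-up example following that advice:

.. code-block:: c++

   // print a formatted warning only on MPI rank 0
   if (comm->me == 0)
     error->warning(FLERR, "Style {} is not yet tested with TIP4P", style);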
**Informational messages**
Finally, for informational message LAMMPS has the
:cpp:func:`utils::logmesg() convenience function
<LAMMPS_NS::utils::logmesg>`. It also uses the `{fmt} library
<https://fmt.dev>`_ to support using a format string followed by a
matching number of arguments. It will output the resulting formatted
text to both, the screen and the logfile and will honor the
corresponding settings about whether this output is active and to which
file it should be send. Same as for ``Error::warning()``, it would
produce output for every MPI process and thus should usually be called
only on MPI rank 0 to avoid flooding the output when running with many
parallel processes.
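
Again a small, made-up example:

.. code-block:: c++

   // informational message, honoring screen and logfile settings
   if (comm->me == 0)
     utils::logmesg(lmp, "  generated {} atoms in {:.3f} seconds\n", natoms, time);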
Choosing between a custom atom style, fix property/atom, and fix STORE/ATOM
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@@ -133,6 +133,9 @@ and parsing files or arguments.
 .. doxygenfunction:: trim_comment
    :project: progguide

+.. doxygenfunction:: strcompress
+   :project: progguide
+
 .. doxygenfunction:: strip_style_suffix
    :project: progguide

@@ -166,6 +169,9 @@ and parsing files or arguments.
 .. doxygenfunction:: split_lines
    :project: progguide

+.. doxygenfunction:: strsame
+   :project: progguide
+
 .. doxygenfunction:: strmatch
    :project: progguide

@@ -232,12 +238,21 @@ Convenience functions
 .. doxygenfunction:: logmesg(LAMMPS *lmp, const std::string &mesg)
    :project: progguide

+.. doxygenfunction:: print(FILE *fp, const std::string &format, Args&&... args)
+   :project: progguide
+
+.. doxygenfunction:: print(FILE *fp, const std::string &mesg)
+   :project: progguide
+
 .. doxygenfunction:: errorurl
    :project: progguide

 .. doxygenfunction:: missing_cmd_args
    :project: progguide

+.. doxygenfunction:: point_to_error
+   :project: progguide
+
 .. doxygenfunction:: flush_buffers(LAMMPS *lmp)
    :project: progguide

View File

@@ -96,8 +96,8 @@ Here the we specify which methods of the fix should be called during
       MPI_Allreduce(localAvgVel, globalAvgVel, 4, MPI_DOUBLE, MPI_SUM, world);
       scale3(1.0 / globalAvgVel[3], globalAvgVel);
       if ((comm->me == 0) && screen) {
-        fmt::print(screen,"{}, {}, {}\n",
+        utils::print(screen, "{}, {}, {}\n",
                    globalAvgVel[0], globalAvgVel[1], globalAvgVel[2]);
       }
     }

View File

@@ -321,6 +321,8 @@ of the contents of the :f:mod:`LIBLAMMPS` Fortran interface to LAMMPS.
    :ftype set_string_variable: subroutine
    :f set_internal_variable: :f:subr:`set_internal_variable`
    :ftype set_internal_variable: subroutine
+   :f eval: :f:func:`eval`
+   :ftype eval: function
    :f gather_atoms: :f:subr:`gather_atoms`
    :ftype gather_atoms: subroutine
    :f gather_atoms_concat: :f:subr:`gather_atoms_concat`

View File

@@ -64,13 +64,18 @@ simple LAMMPS simulations. It is very suitable for tutorials on LAMMPS
 since you only need to learn how to use a single program for most tasks
 and thus time can be saved and people can focus on learning LAMMPS.
-The tutorials at https://lammpstutorials.github.io/ are specifically
-updated for use with LAMMPS-GUI.
+The tutorials at https://lammpstutorials.github.io/ are specifically
+updated for use with LAMMPS-GUI and their tutorial materials can
+be downloaded and loaded directly from the GUI.

 Another design goal is to keep the barrier low when replacing part of
 the functionality of LAMMPS-GUI with external tools. That said, LAMMPS-GUI
 has some unique functionality that is not found elsewhere:

 - auto-adapting to features available in the integrated LAMMPS library
+- auto-completion for LAMMPS commands and options
+- context-sensitive online help
+- start and stop of simulations via mouse or keyboard
+- monitoring of simulation progress
 - interactive visualization using the :doc:`dump image <dump_image>`
   command with the option to copy-paste the resulting settings
 - automatic slide show generation from dump image out at runtime

View File

@@ -7,6 +7,7 @@ This section documents the following functions:
 - :cpp:func:`lammps_command`
 - :cpp:func:`lammps_commands_list`
 - :cpp:func:`lammps_commands_string`
+- :cpp:func:`lammps_expand`

 --------------------

@@ -79,3 +80,8 @@ Below is a short example using some of these functions.
 .. doxygenfunction:: lammps_commands_string
    :project: progguide
+
+-----------------------
+
+.. doxygenfunction:: lammps_expand
+   :project: progguide

View File

@@ -1,5 +1,5 @@
-Compute, fixes, variables
-=========================
+Computes, fixes, variables
+==========================

 This section documents accessing or modifying data stored by computes,
 fixes, or variables in LAMMPS using the following functions:

@@ -12,6 +12,7 @@ fixes, or variables in LAMMPS using the following functions:
 - :cpp:func:`lammps_set_string_variable`
 - :cpp:func:`lammps_set_internal_variable`
 - :cpp:func:`lammps_variable_info`
+- :cpp:func:`lammps_eval`

 -----------------------

@@ -55,6 +56,11 @@ fixes, or variables in LAMMPS using the following functions:
 -----------------------

+.. doxygenfunction:: lammps_eval
+   :project: progguide
+
+-----------------------
+
 .. doxygenenum:: _LMP_DATATYPE_CONST

 .. doxygenenum:: _LMP_STYLE_CONST
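
As a usage illustration, a small C sketch (the prototype of
``lammps_eval`` is assumed here to take the LAMMPS handle plus an
expression string and to return a double; check ``library.h`` for the
authoritative declaration):

.. code-block:: c

   #include "library.h"
   #include <stdio.h>

   int main(int argc, char **argv)
   {
     void *lmp = lammps_open_no_mpi(0, NULL, NULL);
     lammps_command(lmp, "units lj");
     lammps_command(lmp, "region box block 0 2 0 2 0 2");
     lammps_command(lmp, "create_box 1 box");
     /* assumed: evaluate an equal-style expression and return its value */
     double vol = lammps_eval(lmp, "vol");
     printf("box volume = %g\n", vol);
     lammps_close(lmp);
     return 0;
   }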

View File

@@ -994,6 +994,7 @@ Additional pair styles that are less commonly used.
 * ``src/EXTRA-PAIR``: filenames -> commands
 * :doc:`pair_style <pair_style>`
+* ``examples/PACKAGES/dispersion``

 ----------

View File

@@ -178,3 +178,64 @@ with and without the communication and a Gflop rate is computed. The
 3d rate is with communication; the 1d rate is without (just the 1d
 FFTs). Thus you can estimate what fraction of your FFT time was spent
 in communication, roughly 75% in the example above.
Error message output
====================

Depending on the arguments passed to the error function in the source
code, there will be one to four lines of error output.

A single line
^^^^^^^^^^^^^

The line starts with "ERROR: ", followed by the error message and
information about the location in the source where the error function
was called, in parentheses on the right (here: line 131 of the file
src/fix_print.cpp). Example:

.. parsed-literal::

   ERROR: Fix print timestep variable nevery returned a bad timestep: 9900 (src/fix_print.cpp:131)

Two lines
^^^^^^^^^

In addition to the single line output, the last line of the input will
also be repeated. If a command is spread over multiple lines in the
input using the continuation character '&', then the error will print
the entire concatenated line. For readability all whitespace is
compressed to single blanks. Example:

.. parsed-literal::

   ERROR: Unrecognized fix style 'printf' (src/modify.cpp:924)
   Last input line: fix 0 all printf v_nevery "Step: $(step) ${step}"

Three lines
^^^^^^^^^^^

In addition to the two line output from above, a third line is added
that uses caret character markers '^' to indicate which "word" in the
input failed. Example:

.. parsed-literal::

   ERROR: Illegal fix print nevery value -100; must be > 0 (src/fix_print.cpp:41)
   Last input line: fix 0 all print -100 "Step: $(step) ${stepx}"
                                    ^^^^

Four lines
^^^^^^^^^^

The three line output is expanded to four lines if the input is
modified through input pre-processing, e.g. when substituting
variables. Now the last command is printed once in the original form and
a second time after substitutions are applied. The caret character
markers '^' are applied to the second version. Example:

.. parsed-literal::

   ERROR: Illegal fix print nevery value -100; must be > 0 (src/fix_print.cpp:41)
   Last input line: fix 0 all print ${nevery} 'Step: $(step) ${step}'
   --> parsed line: fix 0 all print -100 "Step: $(step) ${step}"
                                    ^^^^

View File

@@ -67,6 +67,14 @@ version 23 November 2023 and Kokkos version 4.2.
 To build with Kokkos support for AMD GPUs, the AMD ROCm toolkit
 software version 5.2.0 or later must be installed on your system.

+.. admonition:: Intel Data Center GPU support
+   :class: note
+
+   Support for Kokkos with Intel Data Center GPU accelerators (formerly
+   known under the code name "Ponte Vecchio") in LAMMPS is still a work
+   in progress. Only a subset of the functionality works correctly.
+   Please contact the LAMMPS developers if you run into problems.
+
 .. admonition:: CUDA and MPI library compatibility
    :class: note

@@ -80,13 +88,15 @@ version 23 November 2023 and Kokkos version 4.2.
    LAMMPS command-line or by using the command :doc:`package kokkos
    gpu/aware off <package>` in the input file.

-.. admonition:: Intel Data Center GPU support
+.. admonition:: Using multiple MPI ranks per GPU
    :class: note

-   Support for Kokkos with Intel Data Center GPU accelerators (formerly
-   known under the code name "Ponte Vecchio") in LAMMPS is still a work
-   in progress. Only a subset of the functionality works correctly.
-   Please contact the LAMMPS developers if you run into problems.
+   Unlike with the GPU package, there are limited benefits from using
+   multiple MPI processes per GPU with KOKKOS. But when doing this it
+   is **required** to enable CUDA MPS (`Multi-Process Service :: GPU
+   Deployment and Management Documentation
+   <https://docs.nvidia.com/deploy/mps/index.html>`_) to get acceptable
+   performance.

 Building LAMMPS with the KOKKOS package
 """""""""""""""""""""""""""""""""""""""

@@ -365,13 +375,13 @@ one or more nodes, each with two GPUs:
 .. note::

-   When using a GPU, you will achieve the best performance if your
-   input script does not use fix or compute styles which are not yet
-   Kokkos-enabled. This allows data to stay on the GPU for multiple
-   timesteps, without being copied back to the host CPU. Invoking a
-   non-Kokkos fix or compute, or performing I/O for
-   :doc:`thermo <thermo_style>` or :doc:`dump <dump>` output will cause data
-   to be copied back to the CPU incurring a performance penalty.
+   When using a GPU, you will achieve the best performance if your input
+   script does not use fix or compute styles which are not yet
+   Kokkos-enabled. This allows data to stay on the GPU for multiple
+   timesteps, without being copied back to the host CPU. Invoking a
+   non-Kokkos fix or compute, or performing I/O for :doc:`thermo
+   <thermo_style>` or :doc:`dump <dump>` output will cause data to be
+   copied back to the CPU incurring a performance penalty.

 .. note::

@@ -379,6 +389,56 @@ one or more nodes, each with two GPUs:
    kspace, etc., you must set the environment variable ``CUDA_LAUNCH_BLOCKING=1``.
    However, this will reduce performance and is not recommended for production runs.
Troubleshooting segmentation faults on GPUs
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
As noted above, KOKKOS by default assumes that the MPI library is
GPU-aware. This is not always the case and can lead to segmentation
faults when using more than one MPI process. Normally, LAMMPS will
print a warning like "*Turning off GPU-aware MPI since it is not
detected*" or an error message like "*Kokkos with GPU-enabled backend
assumes GPU-aware MPI is available*", or it may simply crash with a
**segmentation fault**. To confirm that a segmentation fault is caused
by this, you can turn off the GPU-aware assumption via the :doc:`package
kokkos command <package>` or the corresponding command-line flag.
If you still get a segmentation fault, despite running with only one MPI
process or using the command-line flag to turn off expecting a GPU-aware
MPI library, then using the CMake compile setting
``-DKokkos_ENABLE_DEBUG=on`` or adding ``KOKKOS_DEBUG=yes`` to your
machine makefile for building with traditional make will generate useful
output that can be passed to the LAMMPS developers for further
debugging.
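
For example, an assumed CMake-based debug configuration might look like
this (the ``kokkos-cuda.cmake`` preset is used here for illustration):

.. code-block:: bash

   cmake -S cmake -B build -C cmake/presets/kokkos-cuda.cmake -D Kokkos_ENABLE_DEBUG=on
   cmake --build build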
Troubleshooting memory allocation on GPUs
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
`Kokkos Tools <https://github.com/kokkos/kokkos-tools/>`_ provides a set
of lightweight profiling and debugging utilities, which interface with
instrumentation hooks (e.g. `space-time-stack
<https://github.com/kokkos/kokkos-tools/tree/develop/profiling/space-time-stack>`_)
built directly into the Kokkos runtime. After compiling a dynamic LAMMPS
library, you then have to set the environment variable ``KOKKOS_TOOLS_LIBS``
before executing your LAMMPS Kokkos run. Example:

.. code-block:: bash

   export KOKKOS_TOOLS_LIBS=${HOME}/kokkos-tools/src/tools/memory-events/kp_memory_event.so
   mpirun -np 4 lmp_kokkos_cuda_openmpi -in in.lj -k on g 4 -sf kk
Starting with the NVIDIA Pascal GPU architecture, CUDA supports
`"Unified Virtual Memory" (UVM)
<https://developer.nvidia.com/blog/unified-memory-cuda-beginners/>`_
which enables allocating more memory than a GPU possesses by also using
memory on the host CPU and then CUDA will transparently move data
between CPU and GPU as needed. The resulting LAMMPS performance depends
on `memory access pattern, data residency, and GPU memory
oversubscription
<https://developer.nvidia.com/blog/improving-gpu-memory-oversubscription-performance/>`_.
The CMake option ``-DKokkos_ENABLE_CUDA_UVM=on`` or the makefile
setting ``KOKKOS_CUDA_OPTIONS=enable_lambda,force_uvm`` enables using
:ref:`UVM with Kokkos <kokkos>` when compiling LAMMPS.
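
A hypothetical configuration enabling UVM for a CUDA build could thus
look like:

.. code-block:: bash

   cmake -S cmake -B build -C cmake/presets/kokkos-cuda.cmake -D Kokkos_ENABLE_CUDA_UVM=on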
Run with the KOKKOS package by editing an input script Run with the KOKKOS package by editing an input script
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@@ -236,6 +236,7 @@ The individual style names on the :doc:`Commands compute <Commands_compute>` page
 * :doc:`fep/ta <compute_fep_ta>` - compute free energies for a test area perturbation
 * :doc:`force/tally <compute_tally>` - force between two groups of atoms via the tally callback mechanism
 * :doc:`fragment/atom <compute_cluster_atom>` - fragment ID for each atom
+* :doc:`gaussian/grid/local <compute_gaussian_grid_local>` - local array of Gaussian atomic contributions on a regular grid
 * :doc:`global/atom <compute_global_atom>` - assign global values to each atom from arrays of global values
 * :doc:`group/group <compute_group_group>` - energy/force between two groups of atoms
 * :doc:`gyration <compute_gyration>` - radius of gyration of group of atoms

View File

@@ -0,0 +1,97 @@
.. index:: compute gaussian/grid/local
.. index:: compute gaussian/grid/local/kk

compute gaussian/grid/local command
===================================

Accelerator Variants: *gaussian/grid/local/kk*

Syntax
""""""

.. code-block:: LAMMPS

   compute ID group-ID gaussian/grid/local grid nx ny nz rcutfac R_1 R_2 ... sigma_1 sigma_2 ...

* ID, group-ID are documented in :doc:`compute <compute>` command
* gaussian/grid/local = style name of this compute command
* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
* *rcutfac* = scale factor applied to all cutoff radii (positive real)
* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
* *sigma_1, sigma_2,...* = Gaussian widths, one for each type (distance units)

Examples
""""""""

.. code-block:: LAMMPS

   compute mygrid all gaussian/grid/local grid 40 40 40 4.0 0.5 0.5 0.4 0.4

Description
"""""""""""

Define a computation that calculates a Gaussian representation of the
ionic structure. This representation is used for the efficient
evaluation of quantities related to the structure factor in a grid-based
workflow, such as the ML-DFT workflow MALA :ref:`(Ellis) <Ellis2021b>`,
for which it was originally implemented. Usage of the workflow is
described in a separate publication :ref:`(Fiedler) <Fiedler2023>`.

For each LAMMPS type, a separate sum of Gaussians is calculated, using a
separate Gaussian broadening per type. The computation is always
performed on the numerical grid; no atom-based version of this compute
exists. The Gaussian representation can only be computed in a local
fashion, thus the output array only contains rows for grid points that
are local to the processor subdomain. The layout of the grid is the same
as for the :doc:`sna/grid/local <compute_sna_atom>` command. Namely, the
array contains one row for each of the local grid points, looping over
the global index *ix* fastest, then *iy*, and *iz* slowest. Each row of
the array contains the global indexes *ix*, *iy*, and *iz* first,
followed by the *x*, *y*, and *z* coordinates of the grid point,
followed by the values of the Gaussians (one floating point number per
type per grid point).

----------

.. include:: accel_styles.rst

----------

Output info
"""""""""""

Compute *gaussian/grid/local* evaluates a local array. The array
contains one row for each of the local grid points, looping over the
global index *ix* fastest, then *iy*, and *iz* slowest. The array
contains :math:`ntypes+6` columns, where *ntypes* is the number of
LAMMPS types. The first three columns are the global indexes *ix*,
*iy*, and *iz*, followed by the *x*, *y*, and *z* coordinates of the
grid point, followed by the *ntypes* columns containing the values of
the Gaussians for each type.

Restrictions
""""""""""""

These computes are part of the ML-SNAP package. They are only enabled
if LAMMPS was built with that package. See the :doc:`Build package
<Build_package>` page for more info.

Related commands
""""""""""""""""

:doc:`compute sna/grid/local <compute_sna_atom>`

----------

.. _Ellis2021b:

**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_

.. _Fiedler2023:

**(Fiedler)** Fiedler, Modine, Schmerler, Vogel, Popoola, Thompson, Rajamanickam, and Cangi, `npj Comp. Mater., 9, 115 (2023) <https://doi.org/10.1038/s41524-023-01070-z>`_

View File

@ -3,7 +3,9 @@
.. index:: compute snav/atom .. index:: compute snav/atom
.. index:: compute snap .. index:: compute snap
.. index:: compute sna/grid .. index:: compute sna/grid
.. index:: compute sna/grid/kk
.. index:: compute sna/grid/local .. index:: compute sna/grid/local
.. index:: compute sna/grid/local/kk
compute sna/atom command compute sna/atom command
======================== ========================
@ -20,9 +22,14 @@ compute snap command
compute sna/grid command compute sna/grid command
======================== ========================
compute sna/grid/kk command
===========================
compute sna/grid/local command compute sna/grid/local command
============================== ==============================
Accelerator Variants: *sna/grid/local/kk*
Syntax Syntax
"""""" """"""
@ -33,17 +40,17 @@ Syntax
compute ID group-ID snav/atom rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ... compute ID group-ID snav/atom rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ... compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ... compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
compute ID group-ID sna/grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ... compute ID group-ID sna/grid grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
compute ID group-ID sna/grid/local nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ... compute ID group-ID sna/grid/local grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
* ID, group-ID are documented in :doc:`compute <compute>` command * ID, group-ID are documented in :doc:`compute <compute>` command
* sna/atom = style name of this compute command * sna/atom = style name of this compute command
* rcutfac = scale factor applied to all cutoff radii (positive real) * *rcutfac* = scale factor applied to all cutoff radii (positive real)
* rfac0 = parameter in distance to angle conversion (0 < rcutfac < 1) * *rfac0* = parameter in distance to angle conversion (0 < rcutfac < 1)
* twojmax = band limit for bispectrum components (non-negative integer) * *twojmax* = band limit for bispectrum components (non-negative integer)
* R_1, R_2,... = list of cutoff radii, one for each type (distance units) * *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
* w_1, w_2,... = list of neighbor weights, one for each type * *w_1, w_2,...* = list of neighbor weights, one for each type
* nx, ny, nz = number of grid points in x, y, and z directions (positive integer) * *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
* zero or more keyword/value pairs may be appended * zero or more keyword/value pairs may be appended
* keyword = *rmin0* or *switchflag* or *bzeroflag* or *quadraticflag* or *chem* or *bnormflag* or *wselfallflag* or *bikflag* or *switchinnerflag* or *sinner* or *dinner* or *dgradflag* or *nnn* or *wmode* or *delta* * keyword = *rmin0* or *switchflag* or *bzeroflag* or *quadraticflag* or *chem* or *bnormflag* or *wselfallflag* or *bikflag* or *switchinnerflag* or *sinner* or *dinner* or *dgradflag* or *nnn* or *wmode* or *delta*
@ -103,7 +110,7 @@ Examples
compute snap all snap 1.4 0.95 6 2.0 1.0 compute snap all snap 1.4 0.95 6 2.0 1.0
compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 chem 2 0 1 compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 chem 2 0 1
compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 switchinnerflag 1 sinner 1.35 1.6 dinner 0.25 0.3 compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 switchinnerflag 1 sinner 1.35 1.6 dinner 0.25 0.3
compute bgrid all sna/grid/local 200 200 200 1.4 0.95 6 2.0 1.0 compute bgrid all sna/grid/local grid 200 200 200 1.4 0.95 6 2.0 1.0
compute bnnn all sna/atom 9.0 0.99363 8 0.5 1.0 rmin0 0.0 nnn 24 wmode 1 delta 0.2 compute bnnn all sna/atom 9.0 0.99363 8 0.5 1.0 rmin0 0.0 nnn 24 wmode 1 delta 0.2
Description Description
@ -252,7 +259,8 @@ for finite-temperature Kohn-Sham density functional theory (:ref:`Ellis
et al. <Ellis2021>`) Neighbor atoms not in the group do not contribute et al. <Ellis2021>`) Neighbor atoms not in the group do not contribute
to the bispectrum components of the grid points. The distance cutoff to the bispectrum components of the grid points. The distance cutoff
:math:`R_{ii'}` assumes that *i* has the same type as the neighbor atom :math:`R_{ii'}` assumes that *i* has the same type as the neighbor atom
*i'*. *i'*. Both computes can be hardware accelerated with Kokkos by using the
*sna/grid/kk* and *sna/grid/local/kk* commands, respectively.
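As an illustration (added here, not part of the original documentation), the accelerated variant is assumed to take the same arguments as the base style and requires a KOKKOS-enabled LAMMPS build:
.. code-block:: LAMMPS
# hypothetical usage of the Kokkos-accelerated grid compute
compute bgrid all sna/grid/kk grid 200 200 200 1.4 0.95 6 2.0 1.0
Alternatively, the */kk* variants are typically selected automatically via the *-sf kk* command-line switch or the :doc:`suffix kk <suffix>` command.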
Compute *sna/grid* calculates a global array containing bispectrum Compute *sna/grid* calculates a global array containing bispectrum
components for a regular grid of points. components for a regular grid of points.
@ -463,6 +471,12 @@ fluctuations in the resulting local atomic environment fingerprint. The
detailed formalism is given in the paper by Lafourcade et detailed formalism is given in the paper by Lafourcade et
al. :ref:`(Lafourcade) <Lafourcade2023_2>`. al. :ref:`(Lafourcade) <Lafourcade2023_2>`.
----------
.. include:: accel_styles.rst
---------- ----------
Output info Output info
@ -654,7 +668,7 @@ of Angular Momentum, World Scientific, Singapore (1987).
.. _Ellis2021: .. _Ellis2021:
**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, Phys Rev B, 104, 035120, (2021) **(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_
.. _Lafourcade2023_2: .. _Lafourcade2023_2:

View File

@ -62,6 +62,18 @@ For all styles, by default, an interaction is only turned off (or on)
if all the atoms involved are in the specified group. See the *any* if all the atoms involved are in the specified group. See the *any*
keyword to change the behavior. keyword to change the behavior.
.. admonition:: Possible errors caused by using *delete_bonds*
:class: warning
Since this command by default only *turns off* bonded interactions,
their definitions are still present and subject to the limitations
of LAMMPS' domain-decomposition-based parallelization. That is,
when a bond is turned off, the two constituent atoms may drift apart
and can reach a distance that triggers a "bond atoms missing"
error and crashes the simulation. Adding the *remove* keyword (see
below) is required to fully remove those interactions and prevent
the error.
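A minimal sketch (not taken from the original text; the bond type is hypothetical) of fully removing turned-off bonds so the error cannot occur:
.. code-block:: LAMMPS
# permanently delete all type-2 bonds and recompute the special neighbor lists
delete_bonds all bond 2 remove special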
Several of the styles (\ *atom*, *bond*, *angle*, *dihedral*, *improper*\ ) Several of the styles (\ *atom*, *bond*, *angle*, *dihedral*, *improper*\ )
take a *type* as an argument. The specified *type* can be a take a *type* as an argument. The specified *type* can be a
:doc:`type label <Howto_type_labels>`. Otherwise, the type should be an :doc:`type label <Howto_type_labels>`. Otherwise, the type should be an
@ -98,15 +110,18 @@ of all interactions in the specified group is simply reported. This
is useful for diagnostic purposes if bonds have been turned off by a is useful for diagnostic purposes if bonds have been turned off by a
bond-breaking potential during a previous run. bond-breaking potential during a previous run.
The default behavior of the delete_bonds command is to turn off .. admonition:: Impact on special_bonds processing and exclusions
interactions by toggling their type to a negative value, but not to :class: note
permanently remove the interaction. For example, a bond_type of 2 is set to
:math:`-2.` The neighbor list creation routines will not include such an The default behavior of the delete_bonds command is to turn off
interaction in their interaction lists. The default is also to not interactions by toggling their type to a negative value, but not to
alter the list of 1--2, 1--3, or 1--4 neighbors computed by the permanently remove the interaction. For example, a bond_type of 2 is set to
:doc:`special_bonds <special_bonds>` command and used to weight pairwise :math:`-2.` The neighbor list creation routines will not include such an
force and energy calculations. This means that pairwise computations interaction in their interaction lists. The default is also to not
will proceed as if the bond (or angle, etc.) were still turned on. alter the list of 1--2, 1--3, or 1--4 neighbors computed by the
:doc:`special_bonds <special_bonds>` command and used to weight pairwise
force and energy calculations. This means that pairwise computations
will proceed as if the bond (or angle, etc.) were still turned on.
Several keywords can be appended to the argument list to alter the Several keywords can be appended to the argument list to alter the
default behaviors. default behaviors.
@ -138,9 +153,11 @@ operation, after (optional) removal. It re-computes the pairwise 1--2,
turned-off bonds the same as turned-on. Thus, turned-off bonds must turned-off bonds the same as turned-on. Thus, turned-off bonds must
be removed if you wish to change the weighting list. be removed if you wish to change the weighting list.
Note that the choice of *remove* and *special* options affects how .. note::
1--2, 1--3, 1--4 pairwise interactions will be computed across bonds that
have been modified by the delete_bonds command. The choice of *remove* and *special* options affects how 1--2,
1--3, 1--4 pairwise interactions will be computed across bonds
that have been modified by the delete_bonds command.
Restrictions Restrictions
"""""""""""" """"""""""""

View File

@ -237,6 +237,7 @@ accelerated styles exist.
* :doc:`dt/reset <fix_dt_reset>` - reset the timestep based on velocity, forces * :doc:`dt/reset <fix_dt_reset>` - reset the timestep based on velocity, forces
* :doc:`edpd/source <fix_dpd_source>` - add heat source to eDPD simulations * :doc:`edpd/source <fix_dpd_source>` - add heat source to eDPD simulations
* :doc:`efield <fix_efield>` - impose electric field on system * :doc:`efield <fix_efield>` - impose electric field on system
* :doc:`efield/lepton <fix_efield_lepton>` - impose electric field on system using a Lepton expression for the potential
* :doc:`efield/tip4p <fix_efield>` - impose electric field on system with TIP4P molecules * :doc:`efield/tip4p <fix_efield>` - impose electric field on system with TIP4P molecules
* :doc:`ehex <fix_ehex>` - enhanced heat exchange algorithm * :doc:`ehex <fix_ehex>` - enhanced heat exchange algorithm
* :doc:`electrode/conp <fix_electrode>` - impose electric potential * :doc:`electrode/conp <fix_electrode>` - impose electric potential

View File

@ -45,8 +45,9 @@ Description
Add a force :math:`\vec{F} = q\vec{E}` to each charged atom in the group due to an Add a force :math:`\vec{F} = q\vec{E}` to each charged atom in the group due to an
external electric field being applied to the system. If the system external electric field being applied to the system. If the system
contains point-dipoles, also add a torque on the dipoles due to the contains point-dipoles, also add a torque :math:`\vec{T} = \vec{p} \times \vec{E}` on the dipoles due to the
external electric field. external electric field. This fix does not compute the dipole force :math:`\vec{F} = (\vec{p} \cdot \nabla) \vec{E}`,
and the :doc:`fix efield/lepton <fix_efield_lepton>` command should be used instead.
.. versionadded:: 28Mar2023 .. versionadded:: 28Mar2023
@ -68,6 +69,7 @@ For point-dipoles, equal-style variables can be used, but atom-style
variables are not currently supported, since they imply a spatial variables are not currently supported, since they imply a spatial
gradient in the electric field which means additional terms with gradient in the electric field which means additional terms with
gradients of the field are required for the force and torque on dipoles. gradients of the field are required for the force and torque on dipoles.
The :doc:`fix efield/lepton <fix_efield_lepton>` command should be used instead.
Equal-style variables can specify formulas with various mathematical Equal-style variables can specify formulas with various mathematical
functions, and include :doc:`thermo_style <thermo_style>` command functions, and include :doc:`thermo_style <thermo_style>` command
@ -229,7 +231,7 @@ Fix style *efield/tip4p* can only be used with tip4p pair styles.
Related commands Related commands
"""""""""""""""" """"""""""""""""
:doc:`fix addforce <fix_addforce>` :doc:`fix addforce <fix_addforce>`, :doc:`fix efield/lepton <fix_efield_lepton>`
Default Default
""""""" """""""

View File

@ -0,0 +1,143 @@
.. index:: fix efield/lepton
fix efield/lepton command
=========================
Syntax
""""""
.. code-block:: LAMMPS
fix ID group-ID efield/lepton V ...
* ID, group-ID are documented in the :doc:`fix <fix>` command
* style = *efield/lepton*
* V = electric potential (electric field * distance units)
* V must be a Lepton expression (see below)
* zero or more keyword/value pairs may be appended to args
* keyword = *region* or *step*
.. parsed-literal::
*region* value = region-ID
region-ID = ID of region atoms must be in to have effect
*step* value = h
h = step size for numerical differentiation (distance units)
Examples
""""""""
.. code-block:: LAMMPS
fix ex all efield/lepton "-E*x; E=1"
fix dexx all efield/lepton "-0.5*x^2" step 1
fix yukawa all efield/lepton "A*exp(-B*r)/r; r=abs(sqrt(x^2+y^2+z^2)); A=1; B=1" step 1e-6
fix infp all efield/lepton "-abs(x)" step 1
variable th equal 2*PI*ramp(0,1)
fix erot all efield/lepton "-(x*cos(v_th)+y*sin(v_th))"
Description
"""""""""""
.. versionadded:: TBD
Add an electric potential :math:`V` that applies a force :math:`\vec{F} = q \vec{E}` to the charged atoms in the group,
and a force :math:`\vec{F} = (\vec{p} \cdot \nabla) \vec{E}` and torque :math:`\vec{T} = \vec{p} \times \vec{E}` to dipoles,
where :math:`\vec{E} = - \nabla V`. The fix also evaluates the electrostatic energy (:math:`U_{q} = q V` and :math:`U_{p} = - \vec{p} \cdot \vec{E}`)
due to this potential when the :doc:`fix_modify energy yes <fix_modify>` command is specified (see below).
.. note::
This command should be used instead of :doc:`fix efield <fix_efield>` if you want to impose a non-uniform electric field on a system with dipoles
since the latter does not include the dipole force term. If you only have charges or if the electric field gradient is negligible,
:doc:`fix efield <fix_efield>` should be used since it is faster.
The `Lepton library <https://simtk.org/projects/lepton>`_, which the *efield/lepton* fix style interfaces with, evaluates
the expression string at run time to compute the energy, forces, and torques. It creates an analytical representation
of :math:`V` and :math:`\vec{E}`, while the gradient force on dipoles is computed using a central difference scheme
.. math::
\vec{F} = \frac{|\vec{p}|}{2h} \left[ \vec{E}(\vec{x} + h \hat{p}) - \vec{E}(\vec{x} - h \hat{p}) \right] .
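As an illustrative check (added here, not part of the original text): for the cubic potential :math:`V = -x^3/6` used in one of the bundled examples, :math:`\vec{E} = \tfrac{1}{2} x^2 \hat{x}`, and for a dipole :math:`\vec{p} = p \hat{x}` the scheme gives
.. math::
\vec{F} = \frac{p}{2h} \left[ \frac{(x+h)^2}{2} - \frac{(x-h)^2}{2} \right] \hat{x} = p \, x \, \hat{x},
which is exact for any step :math:`h` because :math:`\vec{E}` is quadratic in :math:`x`.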
The Lepton expression must either be enclosed in quotes or contain no whitespace so that LAMMPS
recognizes it as a single keyword. More on valid Lepton expressions below. The final Lepton expression must
be a function of :math:`x, y, z` only, which refer to the current *unwrapped* coordinates of the atoms to ensure continuity.
Special care must be taken when using this fix with periodic boundary conditions or box-changing commands.
----------
.. include:: lepton_expression.rst
----------
If the *region* keyword is used, the atom must also be in the specified
geometric :doc:`region <region>` in order to be affected by the potential.
The *step* keyword is required when :doc:`atom_style dipole <atom_style>` is used and the electric field is non-uniform.
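A hedged sketch combining both keywords (the region ID and numeric values are illustrative only):
.. code-block:: LAMMPS
region slab block INF INF INF INF -5 5
fix f1 all efield/lepton "-0.5*x^2" region slab step 1e-6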
----------
Restart, fix_modify, output, run start/stop, minimize info
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
No information about this fix is written to :doc:`binary restart files
<restart>`.
The :doc:`fix_modify <fix_modify>` *energy* option is supported by this
fix to add the potential energy defined above to the global potential energy
of the system as part of :doc:`thermodynamic output <thermo_style>`.
The default setting for this fix is :doc:`fix_modify energy no <fix_modify>`.
The :doc:`fix_modify <fix_modify>` *virial* option is supported by this
fix to add the contribution due to the added *forces* on charges and dipoles
to both the global pressure and per-atom stress of the system via the
:doc:`compute pressure <compute_pressure>` and :doc:`compute stress/atom
<compute_stress_atom>` commands. The former can be accessed by
:doc:`thermodynamic output <thermo_style>`. The default setting for
this fix is :doc:`fix_modify virial no <fix_modify>`.
The :doc:`fix_modify <fix_modify>` *respa* option is supported by this
fix. This allows one to set at which level of the :doc:`r-RESPA <run_style>`
integrator the fix adds its forces. Default is the outermost level.
This fix computes a global scalar and a global 3-vector of forces,
which can be accessed by various :doc:`output commands <Howto_output>`.
The scalar is the potential energy discussed above.
The vector is the total force added to the group of atoms.
The scalar and vector values calculated by this fix are "extensive".
This fix cannot be used with the *start/stop* keywords of
the :doc:`run <run>` command.
The forces due to this fix are imposed during an energy minimization,
invoked by the :doc:`minimize <minimize>` command. You should not
specify force components with a variable that has time-dependence for
use with a minimizer, since the minimizer increments the timestep as
the iteration count during the minimization.
.. note::
If you want the electric potential energy to be included in the
total potential energy of the system (the quantity being minimized),
you MUST enable the :doc:`fix_modify <fix_modify>` *energy* option for this fix.
----------
Restrictions
""""""""""""
Fix style *efield/lepton* is part of the LEPTON package. It is only enabled if LAMMPS was built with that package.
See the :doc:`Build package <Build_package>` page for more info.
Related commands
""""""""""""""""
:doc:`fix efield <fix_efield>`
Default
"""""""
none

View File

@ -26,6 +26,29 @@ Syntax
*nowait* arg = *on* or *off* *nowait* arg = *on* or *off*
off = LAMMPS waits to be connected to an IMD client before continuing (default) off = LAMMPS waits to be connected to an IMD client before continuing (default)
on = LAMMPS listens for an IMD client, but continues with the run on = LAMMPS listens for an IMD client, but continues with the run
*version* arg = *2* or *3*
2 = use IMD protocol version 2 (default)
3 = use IMD protocol version 3
The following keywords are only supported for IMD protocol version 3.
.. parsed-literal::
*time* arg = *on* or *off*
off = simulation time is not transmitted (default)
on = simulation time is transmitted
*box* arg = *on* or *off*
off = simulation box data is not transmitted (default)
on = simulation box data is transmitted
*coordinates* arg = *on* or *off*
off = atomic coordinates are not transmitted (default)
on = atomic coordinates are transmitted
*velocities* arg = *on* or *off*
off = atomic velocities are not transmitted (default)
on = atomic velocities are transmitted
*forces* arg = *on* or *off*
off = atomic forces are not transmitted (default)
on = atomic forces are transmitted
Examples Examples
"""""""" """"""""
@ -40,16 +63,19 @@ Description
This fix implements the "Interactive MD" (IMD) protocol which allows This fix implements the "Interactive MD" (IMD) protocol which allows
realtime visualization and manipulation of MD simulations through the realtime visualization and manipulation of MD simulations through the
IMD protocol, as initially implemented in VMD and NAMD. Specifically IMD protocol, as initially implemented in VMD and NAMD. Specifically it
it allows LAMMPS to connect an IMD client, for example the `VMD visualization program <VMD_>`_, so that it can monitor the progress of the allows LAMMPS to connect an IMD client, for example the `VMD
simulation and interactively apply forces to selected atoms. visualization program <VMD_>`_ (currently only supports IMDv2) or the
`Python IMDClient <IMDClient_>`_ (supports both IMDv2 and IMDv3), so
that it can monitor the progress of the simulation and interactively
apply forces to selected atoms.
If LAMMPS is compiled with the pre-processor flag -DLAMMPS_ASYNC_IMD If LAMMPS is compiled with the pre-processor flag
then fix imd will use POSIX threads to spawn a IMD communication :ref:`-DLAMMPS_ASYNC_IMD <misc>` then fix imd will use POSIX threads to
thread on MPI rank 0 in order to offload data reading and writing spawn an IMD communication thread on MPI rank 0 in order to offload data
from the main execution thread and potentially lower the inferred exchange with the IMD client from the main execution thread and
latencies for slow communication links. This feature has only been potentially lower the inferred latencies for slow communication
tested under linux. links. This feature has only been tested under linux.
The source code for this fix includes code developed by the Theoretical The source code for this fix includes code developed by the Theoretical
and Computational Biophysics Group in the Beckman Institute for Advanced and Computational Biophysics Group in the Beckman Institute for Advanced
@ -94,6 +120,15 @@ with different units or as a measure to tweak the forces generated by
the manipulation of the IMD client, this option allows to make the manipulation of the IMD client, this option allows to make
adjustments. adjustments.
.. versionadded:: TBD
In `IMDv3 <IMDv3_>`_, the IMD protocol has been extended to allow for
the transmission of simulation time, box dimensions, atomic coordinates,
velocities, and forces. The *version* keyword selects the
version of the protocol to be used. The *time*, *box*, *coordinates*,
*velocities*, and *forces* keywords select which data are
transmitted to the IMD client. The default is to transmit all data.
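A hedged example (the port number and keyword choices are illustrative only):
.. code-block:: LAMMPS
# stream time, box, and coordinates to an IMDv3 client listening on port 5678
fix comm all imd 5678 version 3 time on box on coordinates on velocities off forces off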
To connect VMD to a listening LAMMPS simulation on the same machine To connect VMD to a listening LAMMPS simulation on the same machine
with fix imd enabled, one needs to start VMD and load a coordinate or with fix imd enabled, one needs to start VMD and load a coordinate or
topology file that matches the fix group. When the VMD command topology file that matches the fix group. When the VMD command
@ -129,6 +164,10 @@ screen output is active.
.. _imdvmd: https://www.ks.uiuc.edu/Research/vmd/imd/ .. _imdvmd: https://www.ks.uiuc.edu/Research/vmd/imd/
.. _IMDClient: https://github.com/Becksteinlab/imdclient/tree/main/imdclient
.. _IMDv3: https://imdclient.readthedocs.io/en/latest/protocol_v3.html
Restart, fix_modify, output, run start/stop, minimize info Restart, fix_modify, output, run start/stop, minimize info
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" """""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@ -147,14 +186,14 @@ This fix is part of the MISC package. It is only enabled if LAMMPS was
built with that package. See the :doc:`Build package <Build_package>` built with that package. See the :doc:`Build package <Build_package>`
page for more info. page for more info.
When used in combination with VMD, a topology or coordinate file has When used in combination with VMD, a topology or coordinate file has to
to be loaded, which matches (in number and ordering of atoms) the be loaded, which matches (in number and ordering of atoms) the group the
group the fix is applied to. The fix internally sorts atom IDs by fix is applied to. The fix internally sorts atom IDs by ascending
ascending integer value; in VMD (and thus the IMD protocol) those will integer value; in VMD (and thus the IMD protocol) those will be assigned
be assigned 0-based consecutive index numbers. 0-based consecutive index numbers.
When using multiple active IMD connections at the same time, each When using multiple active IMD connections at the same time, each
needs to use a different port number. fix instance needs to use a different port number.
Related commands Related commands
"""""""""""""""" """"""""""""""""

View File

@ -0,0 +1,158 @@
.. index:: pair_style dispersion/d3
pair_style dispersion/d3 command
================================
Syntax
""""""
.. code-block:: LAMMPS
pair_style dispersion/d3 damping functional cutoff cn_cutoff
* damping = damping function: *zero*, *zerom*, *bj*, or *bjm*
* functional = XC functional form: *pbe*, *pbe0*, ... (see list below)
* cutoff = global cutoff (distance units)
* cn_cutoff = coordination number cutoff (distance units)
Examples
""""""""
.. code-block:: LAMMPS
pair_style dispersion/d3 zero pbe 30.0 20.0
pair_coeff * * C
Description
"""""""""""
.. versionadded:: TBD
Style *dispersion/d3* computes the dispersion energy correction used in
the DFT-D3 method of Grimme :ref:`(Grimme1) <Grimme1>`. It would
typically be combined, via :doc:`pair_style hybrid/overlay <pair_hybrid>`,
with a machine learning (ML) potential that was trained on results from
plain DFT calculations without the dispersion correction. ML potentials
are often combined *a posteriori* with dispersion correction schemes (see
*e.g.* :ref:`(Qamar) <Qamar>` and :ref:`(Batatia) <Batatia>`).
The energy contribution :math:`E_i` for an atom :math:`i` is given by:
.. math::
E_i = \frac{1}{2} \sum_{j \neq i} \big(
s_6 \frac{C_{6,ij}}{r^6_{ij}} f_6^{damp}(r_{ij}) +
s_8 \frac{C_{8,ij}}{r^8_{ij}} f_8^{damp}(r_{ij}) \big)
where :math:`C_{n,ij}` are the averaged, geometry-dependent
:math:`n`-th-order dispersion coefficients for atom pair :math:`ij`,
:math:`r_{ij}` is their inter-nuclear distance, :math:`s_n` are
XC-functional-dependent scaling factors, and :math:`f_n^{damp}` are
damping functions.
.. note::
It is currently *not* possible to calculate three-body dispersion
contributions, according to, for example, the Axilrod-Teller-Muto
model.
Available damping functions are the original "zero-damping"
:ref:`(Grimme1) <Grimme1>`, Becke-Johnson damping :ref:`(Grimme2)
<Grimme2>`, and their revised forms :ref:`(Sherrill) <Sherrill>`.
Available XC functional scaling factors are listed in the table below,
and depend on the selected damping function.
+------------------+--------------------------------------------------------------------------------+
| Damping function | XC functional |
+==================+================================================================================+
| | | | slater-dirac-exchange, b-lyp, b-p, b97-d, revpbe, pbe, pbesol, rpw86-pbe, |
| | | | rpbe, tpss, b3-lyp, pbe0, hse06, revpbe38, pw6b95, tpss0, b2-plyp, pwpb95, |
| | zero | | b2gp-plyp, ptpss, hf, mpwlyp, bpbe, bh-lyp, tpssh, pwb6k, b1b95, bop, o-lyp, |
| | | | o-pbe, ssb, revssb, otpss, b3pw91, revpbe0, pbe38, mpw1b95, mpwb1k, bmk, |
| | | | cam-b3lyp, lc-wpbe, m05, m052x, m06l, m06, m062x, m06hf, hcth120 |
+------------------+--------------------------------------------------------------------------------+
| zerom | b2-plyp, b3-lyp, b97-d, b-lyp, b-p, pbe, pbe0, lc-wpbe |
+------------------+--------------------------------------------------------------------------------+
| | | | b-p, b-lyp, revpbe, rpbe, b97-d, pbe, rpw86-pbe, b3-lyp, tpss, hf, tpss0, |
| | | | pbe0, hse06, revpbe38, pw6b95, b2-plyp, dsd-blyp, dsd-blyp-fc, bop, mpwlyp, |
| | bj | | o-lyp, pbesol, bpbe, opbe, ssb, revssb, otpss, b3pw91, bh-lyp, revpbe0, |
| | | | tpssh, mpw1b95, pwb6k, b1b95, bmk, cam-b3lyp, lc-wpbe, b2gp-plyp, ptpss, |
| | | | pwpb95, hf/mixed, hf/sv, hf/minis, b3lyp/6-31gd, hcth120, pw1pw, pwgga, |
| | | | hsesol, hf3c, hf3cv, pbeh3c, pbeh-3c |
+------------------+--------------------------------------------------------------------------------+
| bjm | b2-plyp, b3-lyp, b97-d, b-lyp, b-p, pbe, pbe0, lc-wpbe |
+------------------+--------------------------------------------------------------------------------+
This style is primarily intended to be used in combination with a
machine-learned interatomic potential trained on a DFT dataset (the
XC functional should be selected accordingly) via the
:doc:`pair_style hybrid <pair_hybrid>` command.
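For example, mirroring the bundled example input for a carbon ACE potential (file path and element taken from that example):
.. code-block:: LAMMPS
pair_style hybrid/overlay pace dispersion/d3 bj pbe 16.0 16.0
pair_coeff * * pace potential_files/c_ace.yace C
pair_coeff * * dispersion/d3 C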
Coefficients
""""""""""""
All the required coefficients are already stored internally (in the
``src/EXTRA-PAIR/d3_parameters.h`` file). The only information to
provide is the chemical symbol for each atom type. The number of
chemical symbols given must equal the number of atom types used, and
their order must match that of the atom types.
Mixing, shift, table, tail correction, restart, rRESPA info
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
This pair style does not support mixing since all parameters are
explicit for each pair of atom types.
This pair style does not support the :doc:`pair_modify` shift, table,
and tail options.
This pair style does not write its information to :doc:`binary restart
files <restart>`.
This pair style can only be used via the *pair* keyword of the
:doc:`run_style respa <run_style>` command. It does not support the
*inner*, *middle*, *outer* keywords.
Restrictions
""""""""""""
Style *dispersion/d3* is part of the EXTRA-PAIR package. It is only
enabled if LAMMPS was built with that package. See the :doc:`Build
package <Build_package>` page for more info.
It is currently *not* possible to calculate three-body dispersion
contributions according to, for example, the Axilrod-Teller-Muto model.
Related commands
""""""""""""""""
:doc:`pair_coeff <pair_coeff>`
Default
"""""""
none
----------
.. _Grimme1:
**(Grimme1)** S. Grimme, J. Antony, S. Ehrlich, and H. Krieg, J. Chem. Phys. 132, 154104 (2010).
.. _Qamar:
**(Qamar)** M. Qamar, M. Mrovec, T. Lysogorskiy, A. Bochkarev, and R. Drautz, J. Chem. Theory Comput. 19, 5151 (2023).
.. _Batatia:
**(Batatia)** I. Batatia, *et al.*, arXiv:2401.00096 (2023).
.. _Grimme2:
**(Grimme2)** S. Grimme, S. Ehrlich and L. Goerigk, J. Comput. Chem. 32, 1456 (2011).
.. _Sherrill:
**(Sherrill)** D. G. A. Smith, L. A. Burns, K. Patkowski, and C. D. Sherrill, J. Phys. Chem. Lett., 7, 2197, (2016).

View File

@ -70,6 +70,12 @@ Examples
pair_coeff 1 1 lj/cut 1.0 1.0 2.5 pair_coeff 1 1 lj/cut 1.0 1.0 2.5
pair_coeff 1 1 morse 1.0 1.0 1.0 2.5 pair_coeff 1 1 morse 1.0 1.0 1.0 2.5
variable peratom1 atom 1/(1+exp(-$k*vx^2))
variable peratom2 atom 1-v_peratom1
pair_style hybrid/scaled v_peratom1 lj/cut 2.5 v_peratom2 morse 2.5
pair_coeff 1 1 lj/cut 1.0 1.0 2.5
pair_coeff 1 1 morse 1.0 1.0 1.0 2.5
Description Description
""""""""""" """""""""""
@ -78,7 +84,7 @@ styles enable the use of multiple pair styles in one simulation. With
the *hybrid* style, exactly one pair style is assigned to each pair of the *hybrid* style, exactly one pair style is assigned to each pair of
atom types. With the *hybrid/overlay* and *hybrid/scaled* styles, one atom types. With the *hybrid/overlay* and *hybrid/scaled* styles, one
or more pair styles can be assigned to each pair of atom types. With or more pair styles can be assigned to each pair of atom types. With
the hybrid/molecular style, pair styles are assigned to either intra- the *hybrid/molecular* style, pair styles are assigned to either intra-
or inter-molecular interactions. or inter-molecular interactions.
The assignment of pair styles to type pairs is made via the The assignment of pair styles to type pairs is made via the
@ -114,16 +120,26 @@ restrictions discussed below.
If the *hybrid/scaled* style is used instead of *hybrid/overlay*, If the *hybrid/scaled* style is used instead of *hybrid/overlay*,
contributions from sub-styles are weighted by their scale factors, which contributions from sub-styles are weighted by their scale factors, which
may be fractional or even negative. Furthermore the scale factors may may be fractional or even negative. Furthermore the scale factor for
be variables that may change during a simulation. This enables each sub-style may be a constant, an *equal* style variable, or an *atom*
style variable. Variable scale factors may change during the simulation.
Different sub-styles may use different scale factor styles.
In the case of a sub-style scale factor that is an *atom* style variable,
the force contribution to each atom from that sub-style is weighted
by the value of the variable for that atom, while the contribution
from that sub-style to the global potential energy is zero.
All other contributions from that sub-style to the per-atom energy,
per-atom virial, and global virial (if the latter is not obtained
from forces) are likewise zero.
This enables
switching smoothly between two different pair styles or two different switching smoothly between two different pair styles or two different
parameter sets during a run in a similar fashion as could be done parameter sets during a run in a similar fashion as could be done
with :doc:`fix adapt <fix_adapt>` or :doc:`fix alchemy <fix_alchemy>`. with :doc:`fix adapt <fix_adapt>` or :doc:`fix alchemy <fix_alchemy>`.
All pair styles that will be used are listed as "sub-styles" following All pair styles that will be used are listed as "sub-styles" following
the *hybrid* or *hybrid/overlay* keyword, in any order. In case of the the *hybrid* or *hybrid/overlay* keyword, in any order. In case of the
*hybrid/scaled* pair style, each sub-style is prefixed with a scale *hybrid/scaled* pair style, each sub-style is prefixed with a scale
factor. The scale factor is either a floating point number or an equal factor. The scale factor is either a floating point number or an
*equal* or *atom*
style (or equivalent) variable. Each sub-style's name is followed by style (or equivalent) variable. Each sub-style's name is followed by
its usual arguments, as illustrated in the examples above. See the doc its usual arguments, as illustrated in the examples above. See the doc
pages of the individual pair styles for a listing and explanation of the pages of the individual pair styles for a listing and explanation of the
@ -374,7 +390,7 @@ between all atoms of types 1,3,4 will be computed by that potential.
Pair_style hybrid allows interactions between type pairs 2-2, 1-2, Pair_style hybrid allows interactions between type pairs 2-2, 1-2,
2-3, 2-4 to be specified for computation by other pair styles. You 2-3, 2-4 to be specified for computation by other pair styles. You
could even add a second interaction for 1-1 to be computed by another could even add a second interaction for 1-1 to be computed by another
pair style, assuming pair_style hybrid/overlay is used. pair style, assuming pair_style *hybrid/overlay* is used.
But you should not, as a general rule, attempt to exclude the many-body But you should not, as a general rule, attempt to exclude the many-body
interactions for some subset of the type pairs within the set of 1,3,4 interactions for some subset of the type pairs within the set of 1,3,4
@ -414,7 +430,7 @@ passed to the Tersoff potential, which means it would compute no
3-body interactions containing both type 1 and 2 atoms. 3-body interactions containing both type 1 and 2 atoms.
Here is another example to use 2 many-body potentials together in an Here is another example to use 2 many-body potentials together in an
overlapping manner using hybrid/overlay. Imagine you have CNT (C atoms) overlapping manner using *hybrid/overlay*. Imagine you have CNT (C atoms)
on a Si surface. You want to use Tersoff for Si/Si and Si/C on a Si surface. You want to use Tersoff for Si/Si and Si/C
interactions, and AIREBO for C/C interactions. Si atoms are type 1; C interactions, and AIREBO for C/C interactions. Si atoms are type 1; C
atoms are type 2. Something like this will work: atoms are type 2. Something like this will work:

View File

@ -172,6 +172,7 @@ accelerated styles exist.
* :doc:`coul/tt <pair_coul_tt>` - damped charge-dipole Coulomb for Drude dipoles * :doc:`coul/tt <pair_coul_tt>` - damped charge-dipole Coulomb for Drude dipoles
* :doc:`coul/wolf <pair_coul>` - Coulomb via Wolf potential * :doc:`coul/wolf <pair_coul>` - Coulomb via Wolf potential
* :doc:`coul/wolf/cs <pair_cs>` - Coulomb via Wolf potential with core/shell adjustments * :doc:`coul/wolf/cs <pair_cs>` - Coulomb via Wolf potential with core/shell adjustments
* :doc:`dispersion/d3 <pair_dispersion_d3>` - Dispersion correction for potentials derived from DFT functionals
* :doc:`dpd <pair_dpd>` - dissipative particle dynamics (DPD) * :doc:`dpd <pair_dpd>` - dissipative particle dynamics (DPD)
* :doc:`dpd/coul/slater/long <pair_dpd_coul_slater_long>` - dissipative particle dynamics (DPD) with electrostatic interactions * :doc:`dpd/coul/slater/long <pair_dpd_coul_slater_long>` - dissipative particle dynamics (DPD) with electrostatic interactions
* :doc:`dpd/ext <pair_dpd_ext>` - generalized force field for DPD * :doc:`dpd/ext <pair_dpd_ext>` - generalized force field for DPD

View File

@ -25,6 +25,7 @@ Ackland
acks acks
acolor acolor
acos acos
acs
Acta Acta
actinide actinide
activationfunctions activationfunctions
@ -248,6 +249,7 @@ basename
Bashford Bashford
bashrc bashrc
Baskes Baskes
Batatia
Batra Batra
Bayly Bayly
bb bb
@ -255,6 +257,7 @@ bcc
bcolor bcolor
bdiam bdiam
bdw bdw
Becke
Beckman Beckman
Becton Becton
Behler Behler
@ -313,6 +316,7 @@ bitrate
bitrates bitrates
Bitzek Bitzek
Bjerrum Bjerrum
bjm
Bkappa Bkappa
blabel blabel
Blaise Blaise
@ -1232,6 +1236,7 @@ fp
fphi fphi
fPIC fPIC
fplo fplo
fprintf
Fqq Fqq
Fraige Fraige
framerate framerate
@ -1342,6 +1347,7 @@ gmres
gname gname
gneb gneb
GNEB GNEB
Goerigk
Goga Goga
Goldfarb Goldfarb
Gompper Gompper
@ -1707,6 +1713,7 @@ Jaramillo
Jarzynski Jarzynski
jatempl jatempl
javascript javascript
jcc
jcp jcp
jea jea
jec jec
@ -1728,6 +1735,7 @@ Jonsson
Jorgensen Jorgensen
jp jp
jparam jparam
jpclett
jpeg jpeg
jpeglib jpeglib
jpg jpg
@ -1844,6 +1852,7 @@ Krass
Kraus Kraus
Kremer Kremer
Kress Kress
Krieg
Kronik Kronik
ksh ksh
kspace kspace
@ -1916,6 +1925,7 @@ lB
lbfgs lbfgs
lbl lbl
LBtype LBtype
lc
lcbop lcbop
ld ld
lda lda
@ -2062,6 +2072,7 @@ ly
Lybrand Lybrand
Lykotrafitis Lykotrafitis
lyon lyon
lyp
Lysogorskiy Lysogorskiy
Lyulin Lyulin
lz lz
@ -2827,12 +2838,14 @@ pathangle
pathname pathname
pathnames pathnames
Patera Patera
Patkowski
Patomtrans Patomtrans
Pattnaik Pattnaik
Pavese Pavese
Pavia Pavia
Paxton Paxton
pbc pbc
pbe
pc pc
pcg pcg
pchain pchain
@ -2923,6 +2936,7 @@ ploop
PloS PloS
plt plt
plumedfile plumedfile
plyp
pmb pmb
pmcmoves pmcmoves
pme pme
@ -3065,6 +3079,7 @@ qE
qeff qeff
qelectron qelectron
qeq qeq
Qamar
QeQ QeQ
QEq QEq
qfactor qfactor
@ -3366,6 +3381,7 @@ Schilfgarde
Schimansky Schimansky
Schiotz Schiotz
Schlitter Schlitter
Schmerler
Schmid Schmid
Schnieders Schnieders
Schoen Schoen
@ -3432,6 +3448,7 @@ Shardlow
shawn shawn
Shen Shen
Shenderova Shenderova
Sherrill
Shi Shi
Shiga Shiga
Shinoda Shinoda
@ -4027,6 +4044,7 @@ VMDARCH
VMDHOME VMDHOME
vn vn
Voigt Voigt
Vogel
volfactor volfactor
Volkov Volkov
Volpe Volpe
@ -4118,6 +4136,7 @@ workflows
Workum Workum
Worley Worley
wormlike wormlike
wpbe
Wriggers Wriggers
writedata writedata
Wuppertal Wuppertal
@ -4238,6 +4257,7 @@ Zemer
zenodo zenodo
Zentrum Zentrum
Zepeda Zepeda
zerom
zflag zflag
Zhang Zhang
Zhao Zhao

View File

@ -117,6 +117,7 @@ liblammpsplugin_t *liblammpsplugin_load(const char *lib)
ADDSYM(set_string_variable); ADDSYM(set_string_variable);
ADDSYM(set_internal_variable); ADDSYM(set_internal_variable);
ADDSYM(variable_info); ADDSYM(variable_info);
ADDSYM(eval);
ADDSYM(gather_atoms); ADDSYM(gather_atoms);
ADDSYM(gather_atoms_concat); ADDSYM(gather_atoms_concat);

View File

@ -163,6 +163,7 @@ struct _liblammpsplugin {
int (*set_string_variable)(void *, const char *, const char *); int (*set_string_variable)(void *, const char *, const char *);
int (*set_internal_variable)(void *, const char *, double); int (*set_internal_variable)(void *, const char *, double);
int (*variable_info)(void *, int, char *, int); int (*variable_info)(void *, int, char *, int);
double (*eval)(void *, const char *);
void (*gather_atoms)(void *, const char *, int, int, void *); void (*gather_atoms)(void *, const char *, int, int, void *);
void (*gather_atoms_concat)(void *, const char *, int, int, void *); void (*gather_atoms_concat)(void *, const char *, int, int, void *);

View File

@ -0,0 +1,50 @@
# Point dipoles in a 3d box with an external potential (ignoring dipolar interactions)
units lj
atom_style hybrid sphere dipole
dimension 3
boundary s s s
region box block -2 2 -2 2 -2 2
create_box 1 box
create_atoms 1 random 100 12345 NULL
# need both mass settings due to hybrid atom style
mass 1 1.0
set group all mass 1.0
set group all diameter 0.1
set group all dipole/random 98934 0.01
pair_style none
comm_modify cutoff 3.0
velocity all create 0.0 87287 mom yes rot yes
fix 1 all nve/sphere update dipole
###############################################################################################################
## Yukawa potential
#fix 2 all efield/lepton "A*exp(-B*r)/r; r=abs(sqrt(x^2+y^2+z^2)); A = 0.1; B = 5" step 1e-8
## Gradually increasing uniform field
#variable E equal ramp(0,1)
#fix 2 all efield/lepton "-v_E*(x+y+z)"
## Linear gradient field
fix 2 all efield/lepton "-1/6*x^3" step 1e-6
fix_modify 2 energy yes
###############################################################################################################
timestep 1e-3
compute erot all erotate/sphere
variable etotal equal "ke + c_erot + pe" # thermo etotal doesn't include erot
thermo_style custom step temp ke c_erot pe v_etotal
thermo 500
thermo_modify norm no
#dump 1 all custom 500 dump.dipole id x y z diameter mux muy muz fx fy fz tqx tqy tqz
run 10000

View File

@ -0,0 +1,115 @@
LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-283-g742c869534-modified)
using 1 OpenMP thread(s) per MPI task
# Point dipoles in a 3d box with an external potential (ignoring dipolar interactions)
units lj
atom_style hybrid sphere dipole
WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:132)
dimension 3
boundary s s s
region box block -2 2 -2 2 -2 2
create_box 1 box
Created orthogonal box = (-2 -2 -2) to (2 2 2)
1 by 1 by 1 MPI processor grid
create_atoms 1 random 100 12345 NULL
Created 100 atoms
using lattice units in orthogonal box = (-2.0004 -2.0004 -2.0004) to (2.0004 2.0004 2.0004)
create_atoms CPU = 0.000 seconds
# need both mass settings due to hybrid atom style
mass 1 1.0
set group all mass 1.0
Setting atom values ...
100 settings made for mass
set group all diameter 0.1
Setting atom values ...
100 settings made for diameter
set group all dipole/random 98934 0.01
Setting atom values ...
100 settings made for dipole/random
pair_style none
comm_modify cutoff 3.0
velocity all create 0.0 87287 mom yes rot yes
fix 1 all nve/sphere update dipole
###############################################################################################################
## Yukawa potential
#fix 2 all efield/lepton "A*exp(-B*r)/r; r=abs(sqrt(x^2+y^2+z^2)); A = 0.1; B = 5" step 1e-8
## Gradually increasing uniform field
#variable E equal ramp(0,1)
#fix 2 all efield/lepton "-v_E*(x+y+z)"
## Linear gradient field
fix 2 all efield/lepton "-1/6*x^3" step 1e-6
fix_modify 2 energy yes
###############################################################################################################
timestep 1e-3
compute erot all erotate/sphere
variable etotal equal "ke + c_erot + pe" # thermo etotal doesn't include erot
thermo_style custom step temp ke c_erot pe v_etotal
thermo 500
thermo_modify norm no
#dump 1 all custom 500 dump.dipole id x y z diameter mux muy muz fx fy fz tqx tqy tqz
run 10000
WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2442)
Per MPI rank memory allocation (min/avg/max) = 4.273 | 4.273 | 4.273 Mbytes
Step Temp KinEng c_erot PotEng v_etotal
0 0 0 0 0.036419797 0.036419797
500 3.7159175e-06 0.00055181374 0.44262618 -0.40675701 0.036420985
1000 1.2808438e-05 0.0019020531 0.24499116 -0.21047295 0.036420259
1500 2.8343769e-05 0.0042090498 0.26504485 -0.2328336 0.036420307
2000 4.8796894e-05 0.0072463388 0.30953526 -0.28036098 0.036420618
2500 7.8933715e-05 0.011721657 0.2015076 -0.17680909 0.036420173
3000 0.00011381678 0.016901791 0.31002163 -0.29050294 0.036420476
3500 0.00015650339 0.023240753 0.27837968 -0.26520001 0.036420418
4000 0.00020429109 0.030337227 0.26201101 -0.25592795 0.036420289
4500 0.00026362339 0.039148074 0.29769952 -0.3004271 0.036420499
5000 0.00033328941 0.049493478 0.21642442 -0.22949776 0.036420131
5500 0.00040914224 0.060757622 0.28422322 -0.30856047 0.036420377
6000 0.00049425119 0.073396302 0.31767 -0.35464572 0.03642058
6500 0.00058508892 0.086885704 0.29079532 -0.34126075 0.036420276
7000 0.00069845073 0.10371993 0.25776048 -0.32506015 0.036420262
7500 0.0008215656 0.12200249 0.27033777 -0.35591972 0.036420539
8000 0.00095528125 0.14185927 0.33943527 -0.44487406 0.036420479
8500 0.0011052502 0.16412965 0.26727165 -0.39498109 0.036420218
9000 0.0012738298 0.18916373 0.31082058 -0.46356382 0.036420485
9500 0.001464197 0.21743325 0.25669856 -0.43771158 0.036420224
10000 0.0016627654 0.24692067 0.36273185 -0.57323194 0.036420578
Loop time of 0.84714 on 1 procs for 10000 steps with 100 atoms
Performance: 1019901.911 tau/day, 11804.420 timesteps/s, 1.180 Matom-step/s
62.3% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0 | 0 | 0 | 0.0 | 0.00
Neigh | 9.21e-07 | 9.21e-07 | 9.21e-07 | 0.0 | 0.00
Comm | 0.00094138 | 0.00094138 | 0.00094138 | 0.0 | 0.11
Output | 0.0001983 | 0.0001983 | 0.0001983 | 0.0 | 0.02
Modify | 0.84105 | 0.84105 | 0.84105 | 0.0 | 99.28
Other | | 0.004946 | | | 0.58
Nlocal: 100 ave 100 max 100 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 0 ave 0 max 0 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 0 ave 0 max 0 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 0
Ave neighs/atom = 0
Neighbor list builds = 5
Dangerous builds = 0
Total wall time: 0:00:00

View File

@ -0,0 +1,115 @@
LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-283-g742c869534-modified)
using 1 OpenMP thread(s) per MPI task
# Point dipoles in a 3d box with an external potential (ignoring dipolar interactions)
units lj
atom_style hybrid sphere dipole
WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:132)
dimension 3
boundary s s s
region box block -2 2 -2 2 -2 2
create_box 1 box
Created orthogonal box = (-2 -2 -2) to (2 2 2)
1 by 2 by 2 MPI processor grid
create_atoms 1 random 100 12345 NULL
Created 100 atoms
using lattice units in orthogonal box = (-2.0004 -2.0004 -2.0004) to (2.0004 2.0004 2.0004)
create_atoms CPU = 0.000 seconds
# need both mass settings due to hybrid atom style
mass 1 1.0
set group all mass 1.0
Setting atom values ...
100 settings made for mass
set group all diameter 0.1
Setting atom values ...
100 settings made for diameter
set group all dipole/random 98934 0.01
Setting atom values ...
100 settings made for dipole/random
pair_style none
comm_modify cutoff 3.0
velocity all create 0.0 87287 mom yes rot yes
fix 1 all nve/sphere update dipole
###############################################################################################################
## Yukawa potential
#fix 2 all efield/lepton "A*exp(-B*r)/r; r=abs(sqrt(x^2+y^2+z^2)); A = 0.1; B = 5" step 1e-8
## Gradually increasing uniform field
#variable E equal ramp(0,1)
#fix 2 all efield/lepton "-v_E*(x+y+z)"
## Linear gradient field
fix 2 all efield/lepton "-1/6*x^3" step 1e-6
fix_modify 2 energy yes
###############################################################################################################
timestep 1e-3
compute erot all erotate/sphere
variable etotal equal "ke + c_erot + pe" # thermo etotal doesn't include erot
thermo_style custom step temp ke c_erot pe v_etotal
thermo 500
thermo_modify norm no
#dump 1 all custom 500 dump.dipole id x y z diameter mux muy muz fx fy fz tqx tqy tqz
run 10000
WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2442)
Per MPI rank memory allocation (min/avg/max) = 4.289 | 4.289 | 4.289 Mbytes
Step Temp KinEng c_erot PotEng v_etotal
0 0 0 0 0.036419797 0.036419797
500 3.7159175e-06 0.00055181374 0.44262618 -0.40675701 0.036420985
1000 1.2808438e-05 0.0019020531 0.24499116 -0.21047295 0.036420259
1500 2.8343769e-05 0.0042090498 0.26504485 -0.2328336 0.036420307
2000 4.8796894e-05 0.0072463388 0.30953526 -0.28036098 0.036420618
2500 7.8933715e-05 0.011721657 0.2015076 -0.17680909 0.036420173
3000 0.00011381678 0.016901791 0.31002163 -0.29050294 0.036420476
3500 0.00015650339 0.023240753 0.27837968 -0.26520001 0.036420418
4000 0.00020429109 0.030337227 0.26201101 -0.25592795 0.036420289
4500 0.00026362339 0.039148074 0.29769952 -0.3004271 0.036420499
5000 0.00033328941 0.049493478 0.21642442 -0.22949776 0.036420131
5500 0.00040914224 0.060757622 0.28422322 -0.30856047 0.036420377
6000 0.00049425119 0.073396302 0.31767 -0.35464572 0.03642058
6500 0.00058508892 0.086885704 0.29079532 -0.34126075 0.036420276
7000 0.00069845073 0.10371993 0.25776048 -0.32506015 0.036420262
7500 0.0008215656 0.12200249 0.27033777 -0.35591972 0.036420539
8000 0.00095528125 0.14185927 0.33943527 -0.44487406 0.036420479
8500 0.0011052502 0.16412965 0.26727165 -0.39498109 0.036420218
9000 0.0012738298 0.18916373 0.31082058 -0.46356382 0.036420485
9500 0.001464197 0.21743325 0.25669856 -0.43771158 0.036420224
10000 0.0016627654 0.24692067 0.36273185 -0.57323194 0.036420578
Loop time of 0.985035 on 4 procs for 10000 steps with 100 atoms
Performance: 877125.838 tau/day, 10151.919 timesteps/s, 1.015 Matom-step/s
67.7% CPU use with 4 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0 | 0 | 0 | 0.0 | 0.00
Neigh | 7.22e-07 | 8.9125e-07 | 1.031e-06 | 0.0 | 0.00
Comm | 0.09818 | 0.1024 | 0.10798 | 1.1 | 10.40
Output | 0.00021634 | 0.00028668 | 0.00044312 | 0.0 | 0.03
Modify | 0.773 | 0.81845 | 0.84055 | 3.0 | 83.09
Other | | 0.06389 | | | 6.49
Nlocal: 25 ave 30 max 23 min
Histogram: 2 1 0 0 0 0 0 0 0 1
Nghost: 75 ave 77 max 70 min
Histogram: 1 0 0 0 0 0 0 0 1 2
Neighs: 0 ave 0 max 0 min
Histogram: 4 0 0 0 0 0 0 0 0 0
Total # of neighbors = 0
Ave neighs/atom = 0
Neighbor list builds = 5
Dangerous builds = 0
Total wall time: 0:00:00

View File

@ -0,0 +1,6 @@
To run these examples, LAMMPS must be compiled with the ML-PACE (-DPKG_ML-PACE=ON) and EXTRA-PAIR (-DPKG_EXTRA-PAIR=ON) packages.
These examples show how to combine a short-ranged ML potential with a dispersion correction scheme. Here we combine a general-purpose ACE potential for carbon (10.1021/acs.jctc.2c01149) with two different dispersion correction schemes:
- D2 (a pure two-body potential, here tabulated)
- D3 (a many-body potential, implemented in LAMMPS).

View File

@ -0,0 +1,36 @@
atom_style atomic
units metal
boundary p p p
atom_modify sort 0 0.0
lattice sc 1.0
region box block 0 10 0 10 0 10
create_box 1 box
create_atoms 1 region box
variable l equal 47.6
change_box all x final 0 $l y final 0 $l z final 0 $l remap
region world block INF INF INF INF INF INF
### interactions
pair_style hybrid/overlay pace table linear 10000
pair_coeff * * pace potential_files/c_ace.yace C
pair_coeff * * table potential_files/d2.table D2 9.0
mass 1 12.011000
velocity all create 200 1234
compute c1 all pair pace
compute c2 all pair table
# calculate the e/atom for each pair style individually
variable dUpace equal c_c1/atoms
variable dUd2 equal c_c2/atoms
### run
timestep 0.001
fix 1 all nvt temp 200.0 200.0 0.01
thermo_style custom step temp pe press etotal v_dUpace v_dUd2
thermo 10
run 100

View File

@ -0,0 +1,36 @@
atom_style atomic
units metal
boundary p p p
atom_modify sort 0 0.0
lattice sc 1.0
region box block 0 10 0 10 0 10
create_box 1 box
create_atoms 1 region box
variable l equal 47.6
change_box all x final 0 $l y final 0 $l z final 0 $l remap
region world block INF INF INF INF INF INF
### interactions
pair_style hybrid/overlay pace dispersion/d3 bj pbe 16.0 16.0
pair_coeff * * pace potential_files/c_ace.yace C
pair_coeff * * dispersion/d3 C
mass 1 12.011000
velocity all create 200 1234
compute c1 all pair pace
compute c2 all pair dispersion/d3
# calculate the e/atom for each pair style individually
variable Upace equal c_c1/atoms
variable Ud3 equal c_c2/atoms
### run
timestep 0.001
fix 1 all nvt temp 200.0 200.0 0.01
thermo_style custom step temp pe press etotal v_Upace v_Ud3
thermo 10
run 100

View File

@ -0,0 +1,122 @@
LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-125-g095d33dafb)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
using 1 OpenMP thread(s) per MPI task
atom_style atomic
units metal
boundary p p p
atom_modify sort 0 0.0
lattice sc 1.0
Lattice spacing in x,y,z = 1 1 1
region box block 0 10 0 10 0 10
create_box 1 box
Created orthogonal box = (0 0 0) to (10 10 10)
1 by 1 by 1 MPI processor grid
create_atoms 1 region box
Created 1000 atoms
using lattice units in orthogonal box = (0 0 0) to (10 10 10)
create_atoms CPU = 0.000 seconds
variable l equal 47.6
change_box all x final 0 $l y final 0 $l z final 0 $l remap
change_box all x final 0 47.6 y final 0 $l z final 0 $l remap
change_box all x final 0 47.6 y final 0 47.6 z final 0 $l remap
change_box all x final 0 47.6 y final 0 47.6 z final 0 47.6 remap
Changing box ...
orthogonal box = (0 0 0) to (47.6 10 10)
orthogonal box = (0 0 0) to (47.6 47.6 10)
orthogonal box = (0 0 0) to (47.6 47.6 47.6)
region world block INF INF INF INF INF INF
### interactions
pair_style hybrid/overlay pace table linear 10000
ACE version: 2023.11.25
Recursive evaluator is used
pair_coeff * * pace potential_files/c_ace.yace C
Loading potential_files/c_ace.yace
Total number of basis functions
C: 20 (r=1) 455 (r>1)
Mapping LAMMPS atom type #1(C) -> ACE species type #0
pair_coeff * * table potential_files/d2.table D2 9.0
Reading pair table potential file potential_files/d2.table with DATE: 2021-12-16
WARNING: 8063 of 20000 force values in table D2 are inconsistent with -dE/dr.
WARNING: Should only be flagged at inflection points (src/pair_table.cpp:466)
WARNING: 2386 of 20000 distance values in table D2 with relative error
WARNING: over 1e-06 to re-computed values (src/pair_table.cpp:474)
mass 1 12.011000
velocity all create 200 1234
compute c1 all pair pace
compute c2 all pair table
# calculate the e/atom for each pair style individually
variable dUpace equal c_c1/atoms
variable dUd2 equal c_c2/atoms
### run
timestep 0.001
fix 1 all nvt temp 200.0 200.0 0.01
thermo_style custom step temp pe press etotal v_dUpace v_dUd2
thermo 10
run 100
Neighbor list info ...
update: every = 1 steps, delay = 0 steps, check = yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 11
ghost atom cutoff = 11
binsize = 5.5, bins = 9 9 9
2 neighbor lists, perpetual/occasional/extra = 2 0 0
(1) pair pace, perpetual
attributes: full, newton on, cut 7.5
pair build: full/bin/atomonly
stencil: full/bin/3d
bin: standard
(2) pair table, perpetual
attributes: half, newton on
pair build: half/bin/atomonly/newton
stencil: half/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 3.735 | 3.735 | 3.735 Mbytes
Step Temp PotEng Press TotEng v_dUpace v_dUd2
0 200 -262.26589 -9971.6713 -236.43971 -0.2577066 -0.0045592958
10 198.01563 -261.95164 -9936.5218 -236.38171 -0.25738489 -0.004566747
20 199.80384 -261.06484 -9826.0969 -235.26399 -0.25647577 -0.0045890709
30 200.79867 -259.7549 -9655.8924 -233.82559 -0.25512792 -0.0046269853
40 194.7303 -258.36397 -9450.9508 -233.21827 -0.25368377 -0.004680203
50 197.08802 -257.40377 -9200.5727 -231.95362 -0.25265301 -0.0047507608
60 204.21755 -257.66495 -8919.2309 -231.29416 -0.25282305 -0.0048419012
70 216.81983 -260.19034 -8702.5441 -232.19221 -0.25523198 -0.0049583602
80 242.71952 -266.40641 -8617.9868 -235.06383 -0.26129243 -0.0051139831
90 294.45869 -279.46195 -8724.2954 -241.43824 -0.27411961 -0.0053423377
100 400.44323 -307.29577 -9070.6387 -255.58618 -0.30165815 -0.0056376175
Loop time of 2.66184 on 1 procs for 100 steps with 1000 atoms
Performance: 3.246 ns/day, 7.394 hours/ns, 37.568 timesteps/s, 37.568 katom-step/s
99.6% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 2.6584 | 2.6584 | 2.6584 | 0.0 | 99.87
Neigh | 0.0012861 | 0.0012861 | 0.0012861 | 0.0 | 0.05
Comm | 0.00064617 | 0.00064617 | 0.00064617 | 0.0 | 0.02
Output | 0.00024173 | 0.00024173 | 0.00024173 | 0.0 | 0.01
Modify | 0.00099328 | 0.00099328 | 0.00099328 | 0.0 | 0.04
Other | | 0.0002431 | | | 0.01
Nlocal: 1000 ave 1000 max 1000 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 2375 ave 2375 max 2375 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 26027 ave 26027 max 26027 min
Histogram: 1 0 0 0 0 0 0 0 0 0
FullNghs: 17736 ave 17736 max 17736 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 26027
Ave neighs/atom = 26.027
Neighbor list builds = 1
Dangerous builds = 0
Total wall time: 0:00:02

View File

@ -0,0 +1,122 @@
LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-125-g095d33dafb)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
using 1 OpenMP thread(s) per MPI task
atom_style atomic
units metal
boundary p p p
atom_modify sort 0 0.0
lattice sc 1.0
Lattice spacing in x,y,z = 1 1 1
region box block 0 10 0 10 0 10
create_box 1 box
Created orthogonal box = (0 0 0) to (10 10 10)
1 by 2 by 2 MPI processor grid
create_atoms 1 region box
Created 1000 atoms
using lattice units in orthogonal box = (0 0 0) to (10 10 10)
create_atoms CPU = 0.000 seconds
variable l equal 47.6
change_box all x final 0 $l y final 0 $l z final 0 $l remap
change_box all x final 0 47.6 y final 0 $l z final 0 $l remap
change_box all x final 0 47.6 y final 0 47.6 z final 0 $l remap
change_box all x final 0 47.6 y final 0 47.6 z final 0 47.6 remap
Changing box ...
orthogonal box = (0 0 0) to (47.6 10 10)
orthogonal box = (0 0 0) to (47.6 47.6 10)
orthogonal box = (0 0 0) to (47.6 47.6 47.6)
region world block INF INF INF INF INF INF
### interactions
pair_style hybrid/overlay pace table linear 10000
ACE version: 2023.11.25
Recursive evaluator is used
pair_coeff * * pace potential_files/c_ace.yace C
Loading potential_files/c_ace.yace
Total number of basis functions
C: 20 (r=1) 455 (r>1)
Mapping LAMMPS atom type #1(C) -> ACE species type #0
pair_coeff * * table potential_files/d2.table D2 9.0
Reading pair table potential file potential_files/d2.table with DATE: 2021-12-16
WARNING: 8063 of 20000 force values in table D2 are inconsistent with -dE/dr.
WARNING: Should only be flagged at inflection points (src/pair_table.cpp:466)
WARNING: 2386 of 20000 distance values in table D2 with relative error
WARNING: over 1e-06 to re-computed values (src/pair_table.cpp:474)
mass 1 12.011000
velocity all create 200 1234
compute c1 all pair pace
compute c2 all pair table
# calculate the e/atom for each pair style individually
variable dUpace equal c_c1/atoms
variable dUd2 equal c_c2/atoms
### run
timestep 0.001
fix 1 all nvt temp 200.0 200.0 0.01
thermo_style custom step temp pe press etotal v_dUpace v_dUd2
thermo 10
run 100
Neighbor list info ...
update: every = 1 steps, delay = 0 steps, check = yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 11
ghost atom cutoff = 11
binsize = 5.5, bins = 9 9 9
2 neighbor lists, perpetual/occasional/extra = 2 0 0
(1) pair pace, perpetual
attributes: full, newton on, cut 7.5
pair build: full/bin/atomonly
stencil: full/bin/3d
bin: standard
(2) pair table, perpetual
attributes: half, newton on
pair build: half/bin/atomonly/newton
stencil: half/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 3.655 | 3.655 | 3.655 Mbytes
Step Temp PotEng Press TotEng v_dUpace v_dUd2
0 200 -262.26589 -9971.6713 -236.43971 -0.2577066 -0.0045592958
10 198.00622 -261.95011 -9934.5046 -236.38139 -0.25738304 -0.0045670733
20 199.81545 -261.06219 -9818.4051 -235.25985 -0.25647183 -0.0045903655
30 200.85902 -259.76256 -9639.9086 -233.82546 -0.25513263 -0.0046299265
40 195.00229 -258.4153 -9425.3772 -233.23448 -0.25372979 -0.0046855071
50 198.00573 -257.57066 -9164.7658 -232.00201 -0.25281159 -0.0047590772
60 206.26759 -258.09159 -8877.0162 -231.45607 -0.25323684 -0.0048547477
70 219.81939 -261.10607 -8668.5789 -232.7206 -0.25612771 -0.0049783595
80 250.27428 -268.27862 -8601.1343 -235.96048 -0.2631332 -0.0051454143
90 308.88167 -283.24793 -8745.8792 -243.36177 -0.27785093 -0.0053969977
100 427.60692 -315.05776 -9147.2389 -259.8405 -0.30933434 -0.0057234269
Loop time of 0.69628 on 4 procs for 100 steps with 1000 atoms
Performance: 12.409 ns/day, 1.934 hours/ns, 143.620 timesteps/s, 143.620 katom-step/s
99.5% CPU use with 4 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.67839 | 0.68307 | 0.69054 | 0.6 | 98.10
Neigh | 0.00034181 | 0.00034811 | 0.00036188 | 0.0 | 0.05
Comm | 0.0045334 | 0.012031 | 0.016704 | 4.4 | 1.73
Output | 0.00015123 | 0.00017175 | 0.0002318 | 0.0 | 0.02
Modify | 0.00041346 | 0.00043062 | 0.00044327 | 0.0 | 0.06
Other | | 0.0002301 | | | 0.03
Nlocal: 250 ave 261 max 246 min
Histogram: 3 0 0 0 0 0 0 0 0 1
Nghost: 1250 ave 1254 max 1239 min
Histogram: 1 0 0 0 0 0 0 0 0 3
Neighs: 6501 ave 6778 max 6320 min
Histogram: 1 0 2 0 0 0 0 0 0 1
FullNghs: 4421.5 ave 4595 max 4332 min
Histogram: 1 2 0 0 0 0 0 0 0 1
Total # of neighbors = 26004
Ave neighs/atom = 26.004
Neighbor list builds = 1
Dangerous builds = 0
Total wall time: 0:00:00

View File

@ -0,0 +1,117 @@
LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-125-g095d33dafb)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
using 1 OpenMP thread(s) per MPI task
atom_style atomic
units metal
boundary p p p
atom_modify sort 0 0.0
lattice sc 1.0
Lattice spacing in x,y,z = 1 1 1
region box block 0 10 0 10 0 10
create_box 1 box
Created orthogonal box = (0 0 0) to (10 10 10)
1 by 1 by 1 MPI processor grid
create_atoms 1 region box
Created 1000 atoms
using lattice units in orthogonal box = (0 0 0) to (10 10 10)
create_atoms CPU = 0.000 seconds
variable l equal 47.6
change_box all x final 0 $l y final 0 $l z final 0 $l remap
change_box all x final 0 47.6 y final 0 $l z final 0 $l remap
change_box all x final 0 47.6 y final 0 47.6 z final 0 $l remap
change_box all x final 0 47.6 y final 0 47.6 z final 0 47.6 remap
Changing box ...
orthogonal box = (0 0 0) to (47.6 10 10)
orthogonal box = (0 0 0) to (47.6 47.6 10)
orthogonal box = (0 0 0) to (47.6 47.6 47.6)
region world block INF INF INF INF INF INF
### interactions
pair_style hybrid/overlay pace dispersion/d3 bj pbe 16.0 16.0
ACE version: 2023.11.25
Recursive evaluator is used
pair_coeff * * pace potential_files/c_ace.yace C
Loading potential_files/c_ace.yace
Total number of basis functions
C: 20 (r=1) 455 (r>1)
Mapping LAMMPS atom type #1(C) -> ACE species type #0
pair_coeff * * dispersion/d3 C
mass 1 12.011000
velocity all create 200 1234
compute c1 all pair pace
compute c2 all pair dispersion/d3
# calculate the e/atom for each pair style individually
variable Upace equal c_c1/atoms
variable Ud3 equal c_c2/atoms
### run
timestep 0.001
fix 1 all nvt temp 200.0 200.0 0.01
thermo_style custom step temp pe press etotal v_Upace v_Ud3
thermo 10
run 100
Neighbor list info ...
update: every = 1 steps, delay = 0 steps, check = yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 18
ghost atom cutoff = 18
binsize = 9, bins = 6 6 6
2 neighbor lists, perpetual/occasional/extra = 2 0 0
(1) pair pace, perpetual
attributes: full, newton on, cut 7.5
pair build: full/bin/atomonly
stencil: full/bin/3d
bin: standard
(2) pair dispersion/d3, perpetual
attributes: half, newton on
pair build: half/bin/atomonly/newton
stencil: half/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 4.225 | 4.225 | 4.225 Mbytes
Step Temp PotEng Press TotEng v_Upace v_Ud3
0 200 -269.22784 -10163.81 -243.40166 -0.2577066 -0.011521241
10 198.05578 -268.91992 -10128.61 -243.34481 -0.25738487 -0.01153505
20 199.85092 -268.05146 -10018.116 -242.24454 -0.25647561 -0.011575851
30 201.10902 -266.77119 -9847.2946 -240.80181 -0.2551274 -0.011643795
40 195.0686 -265.42225 -9641.6992 -240.23287 -0.25368339 -0.011738855
50 197.63706 -264.51951 -9390.1455 -238.99847 -0.25265765 -0.011861864
60 205.01072 -264.86268 -9107.4427 -238.38947 -0.25284579 -0.012016888
70 217.51797 -267.50863 -8890.9916 -239.42034 -0.25529813 -0.012210496
80 244.30754 -273.91051 -8806.154 -242.36286 -0.26145652 -0.01245399
90 296.72041 -287.2518 -8913.8963 -248.93603 -0.27448382 -0.012767981
100 404.07337 -315.6103 -9266.1292 -263.43195 -0.3024416 -0.013168694
Loop time of 4.52709 on 1 procs for 100 steps with 1000 atoms
Performance: 1.909 ns/day, 12.575 hours/ns, 22.089 timesteps/s, 22.089 katom-step/s
99.6% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 4.5223 | 4.5223 | 4.5223 | 0.0 | 99.89
Neigh | 0.0023631 | 0.0023631 | 0.0023631 | 0.0 | 0.05
Comm | 0.00088624 | 0.00088624 | 0.00088624 | 0.0 | 0.02
Output | 0.00027759 | 0.00027759 | 0.00027759 | 0.0 | 0.01
Modify | 0.0010211 | 0.0010211 | 0.0010211 | 0.0 | 0.02
Other | | 0.0002737 | | | 0.01
Nlocal: 1000 ave 1000 max 1000 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 3913 ave 3913 max 3913 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 116409 ave 116409 max 116409 min
Histogram: 1 0 0 0 0 0 0 0 0 0
FullNghs: 17748 ave 17748 max 17748 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 116409
Ave neighs/atom = 116.409
Neighbor list builds = 1
Dangerous builds = 0
Total wall time: 0:00:04

View File

@ -0,0 +1,117 @@
LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-125-g095d33dafb)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
using 1 OpenMP thread(s) per MPI task
atom_style atomic
units metal
boundary p p p
atom_modify sort 0 0.0
lattice sc 1.0
Lattice spacing in x,y,z = 1 1 1
region box block 0 10 0 10 0 10
create_box 1 box
Created orthogonal box = (0 0 0) to (10 10 10)
1 by 2 by 2 MPI processor grid
create_atoms 1 region box
Created 1000 atoms
using lattice units in orthogonal box = (0 0 0) to (10 10 10)
create_atoms CPU = 0.000 seconds
variable l equal 47.6
change_box all x final 0 $l y final 0 $l z final 0 $l remap
change_box all x final 0 47.6 y final 0 $l z final 0 $l remap
change_box all x final 0 47.6 y final 0 47.6 z final 0 $l remap
change_box all x final 0 47.6 y final 0 47.6 z final 0 47.6 remap
Changing box ...
orthogonal box = (0 0 0) to (47.6 10 10)
orthogonal box = (0 0 0) to (47.6 47.6 10)
orthogonal box = (0 0 0) to (47.6 47.6 47.6)
region world block INF INF INF INF INF INF
### interactions
pair_style hybrid/overlay pace dispersion/d3 bj pbe 16.0 16.0
ACE version: 2023.11.25
Recursive evaluator is used
pair_coeff * * pace potential_files/c_ace.yace C
Loading potential_files/c_ace.yace
Total number of basis functions
C: 20 (r=1) 455 (r>1)
Mapping LAMMPS atom type #1(C) -> ACE species type #0
pair_coeff * * dispersion/d3 C
mass 1 12.011000
velocity all create 200 1234
compute c1 all pair pace
compute c2 all pair dispersion/d3
# calculate the e/atom for each pair style individually
variable Upace equal c_c1/atoms
variable Ud3 equal c_c2/atoms
### run
timestep 0.001
fix 1 all nvt temp 200.0 200.0 0.01
thermo_style custom step temp pe press etotal v_Upace v_Ud3
thermo 10
run 100
Neighbor list info ...
update: every = 1 steps, delay = 0 steps, check = yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 18
ghost atom cutoff = 18
binsize = 9, bins = 6 6 6
2 neighbor lists, perpetual/occasional/extra = 2 0 0
(1) pair pace, perpetual
attributes: full, newton on, cut 7.5
pair build: full/bin/atomonly
stencil: full/bin/3d
bin: standard
(2) pair dispersion/d3, perpetual
attributes: half, newton on
pair build: half/bin/atomonly/newton
stencil: half/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 3.732 | 3.732 | 3.732 Mbytes
Step Temp PotEng Press TotEng v_Upace v_Ud3
0 200 -269.22784 -10163.81 -243.40166 -0.2577066 -0.011521241
10 198.04813 -268.91867 -10126.59 -243.34454 -0.25738301 -0.011535654
20 199.86491 -268.04994 -10010.421 -242.24121 -0.25647167 -0.011578276
30 201.18317 -266.78129 -9831.2837 -240.80233 -0.25513213 -0.011649162
40 195.35281 -265.47802 -9616.0833 -240.25194 -0.2537296 -0.011748422
50 198.56247 -264.69401 -9354.3017 -239.05347 -0.25281709 -0.011876925
60 207.17238 -265.30194 -9065.1196 -238.54959 -0.25326251 -0.012039431
70 221.05245 -268.44583 -8856.3622 -239.90114 -0.25620278 -0.012243053
80 252.00942 -275.82142 -8789.4126 -243.27922 -0.26332044 -0.012500977
90 311.21153 -291.09334 -8935.4036 -250.90632 -0.27825852 -0.012834817
100 431.24438 -323.45003 -9344.1963 -267.76306 -0.31019084 -0.013259185
Loop time of 1.20684 on 4 procs for 100 steps with 1000 atoms
Performance: 7.159 ns/day, 3.352 hours/ns, 82.861 timesteps/s, 82.861 katom-step/s
99.4% CPU use with 4 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 1.2007 | 1.2019 | 1.2029 | 0.1 | 99.60
Neigh | 0.00060541 | 0.00062493 | 0.00064411 | 0.0 | 0.05
Comm | 0.0024344 | 0.0033552 | 0.0045996 | 1.4 | 0.28
Output | 0.00016956 | 0.00017999 | 0.00021054 | 0.0 | 0.01
Modify | 0.00046946 | 0.00048235 | 0.00049796 | 0.0 | 0.04
Other | | 0.0002449 | | | 0.02
Nlocal: 250 ave 261 max 246 min
Histogram: 3 0 0 0 0 0 0 0 0 1
Nghost: 2198 ave 2202 max 2187 min
Histogram: 1 0 0 0 0 0 0 0 0 3
Neighs: 29023.2 ave 29681 max 27646 min
Histogram: 1 0 0 0 0 0 0 1 0 2
FullNghs: 4421 ave 4595 max 4331 min
Histogram: 1 2 0 0 0 0 0 0 0 1
Total # of neighbors = 116093
Ave neighs/atom = 116.093
Neighbor list builds = 1
Dangerous builds = 0
Total wall time: 0:00:01

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -0,0 +1,13 @@
"""
For use with 'in.deca-ala-solv_imd_v3'.
Tested with imdclient v0.1.4 and MDAnalysis v2.8.0
"""
from imdclient.IMD import IMDReader  # imported so MDAnalysis can resolve the imd:// URL (the reader self-registers on import)
import MDAnalysis as mda

u = mda.Universe('data.deca-ala-solv', "imd://localhost:5678", topology_format='DATA')
for ts in u.trajectory:
    print(ts.time)
    print(ts.velocities)

View File

@ -0,0 +1,31 @@
#
units real
neighbor 2.5 bin
neigh_modify delay 1 every 1
atom_style full
bond_style harmonic
angle_style charmm
dihedral_style charmm
improper_style harmonic
pair_style lj/charmm/coul/long 8 10
pair_modify mix arithmetic
special_bonds charmm
read_data data.deca-ala-solv
group peptide id <= 103
fix rigidh all shake 1e-6 100 1000 t 1 2 3 4 5 a 23
thermo 100
thermo_style multi
timestep 2.0
kspace_style pppm 1e-5
fix ensemble all npt temp 300.0 300.0 100.0 iso 1.0 1.0 1000.0 drag 0.2
# IMD setup. Client code available in 'deca-ala-solv_imd_v3.py'
fix comm all imd 5678 unwrap on trate 10 version 3 time on box on coordinates on velocities on forces off
run 5000000

View File

@ -9,5 +9,11 @@ in.snap.Mo_Chen # SNAP linear Mo potential
 in.snap.compute               # SNAP compute for training a linear model
 in.snap.compute.quadratic     # SNAP compute for training a quadratic model
 in.snap.scale.Ni_Zuo_JCPA2020 # SNAP linear Ni potential with thermodynamic integration (fix adapt scale)
+in.C_SNAP                     # SNAP carbon potential
 compute_snap_dgrad.py         # SNAP compute with dgradflag (dBi/dRj) for training a non-linear model
+in.snap.grid                  # SNAP descriptors on a grid
+in.snap.grid.triclinic        # SNAP descriptors on a grid, triclinic
+in.gaussian.grid              # Gaussian descriptors on a grid

View File

@ -0,0 +1,68 @@
# Demonstrate calculation of Gaussian descriptors on a grid
# for a cell with two atoms of type 1 and type 2.
# The output in dump.glocal shows that for grid points
# sitting on an atom of type 1 or 2:
# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
# val2 = 1.0/(0.2 *sqrt(2.0*pi))**3 = 7.93670
# These values are extracted to the log file
#
variable nrep index 1
variable a index 3.316
variable ngrid index 2
units metal
atom_modify map hash
# generate the box and atom positions using a BCC lattice
variable nx equal ${nrep}
variable ny equal ${nrep}
variable nz equal ${nrep}
boundary p p p
lattice custom $a &
a1 1 0 0 &
a2 0 1 0 &
a3 0 0 1 &
basis 0 0 0 &
basis 0.5 0.5 0.5
region box block 0 ${nx} 0 ${ny} 0 ${nz}
create_box 2 box
create_atoms 1 box basis 1 1 basis 2 2
mass * 180.88
# define atom compute and grid compute
variable rcutfac equal 4.67637
variable radelem1 equal 0.5
variable radelem2 equal 0.5
variable sigmaelem1 equal 0.1355
variable sigmaelem2 equal 0.2
variable gaussian_options string &
"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
# build zero potential to force ghost atom creation
pair_style zero ${rcutfac}
pair_coeff * *
# define atom and grid computes
compute mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} &
${gaussian_options}
# define output
dump 1 all local 1000 dump.glocal c_mygridlocal[*]
dump 2 all custom 1000 dump.gatom id x y z
compute val1 all reduce max c_mygridlocal[7] inputs local
compute val2 all reduce max c_mygridlocal[8] inputs local
thermo_style custom step c_val1 c_val2
# run
run 0
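
The peak values quoted in the header comment can be double-checked independently; a minimal Python sketch (not part of the example files), using the fact that a normalized 3D Gaussian of width sigma peaks at 1/(sigma*sqrt(2*pi))**3:

    from math import pi, sqrt

    for sigma in (0.1355, 0.2):
        print(1.0 / (sigma * sqrt(2.0 * pi))**3)
    # prints 25.5219... and 7.93670..., matching c_val1 and c_val2 in the logs below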

View File

@ -47,7 +47,6 @@ lattice custom $a &
 basis 0.0 0.0 0.5 &
 spacing 1 1 1
-box tilt large
 region box prism 0 ${nx} 0 ${ny} 0 ${nz} ${ny} ${nz} ${nz}
 create_box 1 box
 create_atoms 1 box

View File

@ -0,0 +1,129 @@
LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
using 1 OpenMP thread(s) per MPI task
# Demonstrate calculation of Gaussian descriptors on a grid
# for a cell with two atoms of type 1 and type 2.
# The output in dump.glocal shows that for grid points
# sitting on an atom of type 1 or 2:
# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
# val2 = 1.0/(0.2 *sqrt(2.0*pi))**3 = 7.93670
# These values are extracted to the log file
#
variable nrep index 1
variable a index 3.316
variable ngrid index 2
units metal
atom_modify map hash
# generate the box and atom positions using a BCC lattice
variable nx equal ${nrep}
variable nx equal 1
variable ny equal ${nrep}
variable ny equal 1
variable nz equal ${nrep}
variable nz equal 1
boundary p p p
lattice custom $a a1 1 0 0 a2 0 1 0 a3 0 0 1 basis 0 0 0 basis 0.5 0.5 0.5
lattice custom 3.316 a1 1 0 0 a2 0 1 0 a3 0 0 1 basis 0 0 0 basis 0.5 0.5 0.5
Lattice spacing in x,y,z = 3.316 3.316 3.316
region box block 0 ${nx} 0 ${ny} 0 ${nz}
region box block 0 1 0 ${ny} 0 ${nz}
region box block 0 1 0 1 0 ${nz}
region box block 0 1 0 1 0 1
create_box 2 box
Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
1 by 1 by 1 MPI processor grid
create_atoms 1 box basis 1 1 basis 2 2
Created 2 atoms
using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
create_atoms CPU = 0.001 seconds
mass * 180.88
# define atom compute and grid compute
variable rcutfac equal 4.67637
variable radelem1 equal 0.5
variable radelem2 equal 0.5
variable sigmaelem1 equal 0.1355
variable sigmaelem2 equal 0.2
variable gaussian_options string "${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}
4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2}
4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2}
4.67637 0.5 0.5 0.1355 ${sigmaelem2}
4.67637 0.5 0.5 0.1355 0.2
# build zero potential to force ghost atom creation
pair_style zero ${rcutfac}
pair_style zero 4.67637
pair_coeff * *
# define atom and grid computes
compute mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} ${gaussian_options}
compute mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} ${gaussian_options}
compute mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} ${gaussian_options}
compute mygridlocal all gaussian/grid/local grid 2 2 2 ${gaussian_options}
compute mygridlocal all gaussian/grid/local grid 2 2 2 4.67637 0.5 0.5 0.1355 0.2
# define output
dump 1 all local 1000 dump.glocal c_mygridlocal[*]
dump 2 all custom 1000 dump.gatom id x y z
compute val1 all reduce max c_mygridlocal[7] inputs local
compute val2 all reduce max c_mygridlocal[8] inputs local
thermo_style custom step c_val1 c_val2
# run
run 0
WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
Neighbor list info ...
update: every = 1 steps, delay = 0 steps, check = yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 6.67637
ghost atom cutoff = 6.67637
binsize = 3.338185, bins = 1 1 1
1 neighbor lists, perpetual/occasional/extra = 1 0 0
(1) pair zero, perpetual
attributes: half, newton on
pair build: half/bin/atomonly/newton
stencil: half/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes
Step c_val1 c_val2
0 25.521859 7.9367045
Loop time of 1.088e-06 on 1 procs for 0 steps with 2 atoms
183.8% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0 | 0 | 0 | 0.0 | 0.00
Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 0 | 0 | 0 | 0.0 | 0.00
Output | 0 | 0 | 0 | 0.0 | 0.00
Modify | 0 | 0 | 0 | 0.0 | 0.00
Other | | 1.088e-06 | | |100.00
Nlocal: 2 ave 2 max 2 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 339 ave 339 max 339 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 64 ave 64 max 64 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 64
Ave neighs/atom = 32
Neighbor list builds = 0
Dangerous builds = 0
Total wall time: 0:00:00

View File

@ -0,0 +1,130 @@
LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
using 1 OpenMP thread(s) per MPI task
# Demonstrate calculation of Gaussian descriptors on a grid
# for a cell with two atoms of type 1 and type 2.
# The output in dump.glocal shows that for grid points
# sitting on an atom of type 1 or 2:
# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
# val2 = 1.0/(0.2 *sqrt(2.0*pi))**3 = 7.93670
# These values are extracted to the log file
#
variable nrep index 1
variable a index 3.316
variable ngrid index 2
units metal
atom_modify map hash
# generate the box and atom positions using a BCC lattice
variable nx equal ${nrep}
variable nx equal 1
variable ny equal ${nrep}
variable ny equal 1
variable nz equal ${nrep}
variable nz equal 1
boundary p p p
lattice custom $a a1 1 0 0 a2 0 1 0 a3 0 0 1 basis 0 0 0 basis 0.5 0.5 0.5
lattice custom 3.316 a1 1 0 0 a2 0 1 0 a3 0 0 1 basis 0 0 0 basis 0.5 0.5 0.5
Lattice spacing in x,y,z = 3.316 3.316 3.316
region box block 0 ${nx} 0 ${ny} 0 ${nz}
region box block 0 1 0 ${ny} 0 ${nz}
region box block 0 1 0 1 0 ${nz}
region box block 0 1 0 1 0 1
create_box 2 box
Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
1 by 2 by 2 MPI processor grid
create_atoms 1 box basis 1 1 basis 2 2
Created 2 atoms
using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
create_atoms CPU = 0.001 seconds
mass * 180.88
# define atom compute and grid compute
variable rcutfac equal 4.67637
variable radelem1 equal 0.5
variable radelem2 equal 0.5
variable sigmaelem1 equal 0.1355
variable sigmaelem2 equal 0.2
variable gaussian_options string "${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}
4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2}
4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2}
4.67637 0.5 0.5 0.1355 ${sigmaelem2}
4.67637 0.5 0.5 0.1355 0.2
# build zero potential to force ghost atom creation
pair_style zero ${rcutfac}
pair_style zero 4.67637
pair_coeff * *
# define atom and grid computes
compute mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} ${gaussian_options}
compute mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} ${gaussian_options}
compute mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} ${gaussian_options}
compute mygridlocal all gaussian/grid/local grid 2 2 2 ${gaussian_options}
compute mygridlocal all gaussian/grid/local grid 2 2 2 4.67637 0.5 0.5 0.1355 0.2
# define output
dump 1 all local 1000 dump.glocal c_mygridlocal[*]
dump 2 all custom 1000 dump.gatom id x y z
compute val1 all reduce max c_mygridlocal[7] inputs local
compute val2 all reduce max c_mygridlocal[8] inputs local
thermo_style custom step c_val1 c_val2
# run
run 0
WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
Neighbor list info ...
update: every = 1 steps, delay = 0 steps, check = yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 6.67637
ghost atom cutoff = 6.67637
binsize = 3.338185, bins = 1 1 1
1 neighbor lists, perpetual/occasional/extra = 1 0 0
(1) pair zero, perpetual
attributes: half, newton on
pair build: half/bin/atomonly/newton
stencil: half/bin/3d
bin: standard
WARNING: Proc sub-domain size < neighbor skin, could lead to lost atoms (src/domain.cpp:1202)
Per MPI rank memory allocation (min/avg/max) = 3.522 | 3.523 | 3.524 Mbytes
Step c_val1 c_val2
0 25.521859 7.9367045
Loop time of 2.238e-06 on 4 procs for 0 steps with 2 atoms
89.4% CPU use with 4 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0 | 0 | 0 | 0.0 | 0.00
Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 0 | 0 | 0 | 0.0 | 0.00
Output | 0 | 0 | 0 | 0.0 | 0.00
Modify | 0 | 0 | 0 | 0.0 | 0.00
Other | | 2.238e-06 | | |100.00
Nlocal: 0.5 ave 1 max 0 min
Histogram: 2 0 0 0 0 0 0 0 0 2
Nghost: 274.5 ave 275 max 274 min
Histogram: 2 0 0 0 0 0 0 0 0 2
Neighs: 16 ave 40 max 0 min
Histogram: 2 0 0 0 0 0 1 0 0 1
Total # of neighbors = 64
Ave neighs/atom = 32
Neighbor list builds = 0
Dangerous builds = 0
Total wall time: 0:00:00

View File

@ -126,6 +126,7 @@ MODULE LIBLAMMPS
     PROCEDURE :: set_variable => lmp_set_variable
     PROCEDURE :: set_string_variable => lmp_set_string_variable
     PROCEDURE :: set_internal_variable => lmp_set_internal_variable
+    PROCEDURE :: eval => lmp_eval
     PROCEDURE, PRIVATE :: lmp_gather_atoms_int
     PROCEDURE, PRIVATE :: lmp_gather_atoms_double
     GENERIC :: gather_atoms => lmp_gather_atoms_int, &
@ -618,7 +619,14 @@ MODULE LIBLAMMPS
       INTEGER(c_int) :: lammps_set_internal_variable
     END FUNCTION lammps_set_internal_variable

-    SUBROUTINE lammps_gather_atoms(handle, name, type, count, data) BIND(C)
+    FUNCTION lammps_eval(handle, expr) BIND(C)
+      IMPORT :: c_ptr, c_double
+      IMPLICIT NONE
+      TYPE(c_ptr), VALUE :: handle, expr
+      REAL(c_double) :: lammps_eval
+    END FUNCTION lammps_eval
+
+    SUBROUTINE lammps_gather_atoms(handle, name, TYPE, count, DATA) BIND(C)
       IMPORT :: c_int, c_ptr
       IMPLICIT NONE
       TYPE(c_ptr), VALUE :: handle, name, data
@ -1812,7 +1820,7 @@ CONTAINS
   SUBROUTINE lmp_set_internal_variable(self, name, val)
     CLASS(lammps), INTENT(IN) :: self
     CHARACTER(LEN=*), INTENT(IN) :: name
-    REAL(KIND=c_double), INTENT(IN) :: val
+    REAL(c_double), INTENT(IN) :: val
     INTEGER :: err
     TYPE(c_ptr) :: Cname
@ -1826,6 +1834,18 @@ CONTAINS
     END IF
   END SUBROUTINE lmp_set_internal_variable

+  ! equivalent function to lammps_eval
+  FUNCTION lmp_eval(self, expr)
+    CLASS(lammps), INTENT(IN) :: self
+    CHARACTER(LEN=*), INTENT(IN) :: expr
+    REAL(c_double) :: lmp_eval
+    TYPE(c_ptr) :: Cexpr
+
+    Cexpr = f2c_string(expr)
+    lmp_eval = lammps_eval(self%handle, Cexpr)
+    CALL lammps_free(Cexpr)
+  END FUNCTION lmp_eval
+
   ! equivalent function to lammps_gather_atoms (for integers)
   SUBROUTINE lmp_gather_atoms_int(self, name, count, data)
     CLASS(lammps), INTENT(IN) :: self

View File

@ -103,7 +103,7 @@ class command_wrapper(object):
     This method is where the Python 'magic' happens. If a method is not
     defined by the class command_wrapper, it assumes it is a LAMMPS command. It takes
     all the arguments, concatinates them to a single string, and executes it using
-    :py:meth:`lammps.command()`.
+    :py:meth:`lammps.command`.

     Starting with Python 3.6 it also supports keyword arguments. key=value is
     transformed into 'key value'. Note, since these have come last in the
@ -422,6 +422,9 @@ class lammps(object):
         self.lib.lammps_extract_variable_datatype.argtypes = [c_void_p, c_char_p]
         self.lib.lammps_extract_variable_datatype.restype = c_int

+        self.lib.lammps_eval.argtypes = [c_void_p, c_char_p]
+        self.lib.lammps_eval.restype = c_double
+
         self.lib.lammps_fix_external_get_force.argtypes = [c_void_p, c_char_p]
         self.lib.lammps_fix_external_get_force.restype = POINTER(POINTER(c_double))
@ -1671,6 +1674,30 @@ class lammps(object):
   # -------------------------------------------------------------------------

+  def eval(self, expr):
+    """ Evaluate a LAMMPS immediate variable expression
+
+    .. versionadded:: TBD
+
+    This function is a wrapper around the function :cpp:func:`lammps_eval`
+    of the C library interface. It evaluates an expression like in
+    immediate variables and returns the value.
+
+    :param expr: immediate variable expression
+    :type expr: string
+    :return: the result of the evaluation
+    :rtype: c_double
+    """
+    if expr: newexpr = expr.encode()
+    else: return None
+    with ExceptionCheck(self):
+      return self.lib.lammps_eval(self.lmp, newexpr)
+    return None
+
+  # -------------------------------------------------------------------------

   # return vector of atom properties gathered across procs
   # 3 variants to match src/library.cpp
   # name = atom property recognized by LAMMPS in atom->extract()
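
For illustration only (not part of this commit), the new Python wrapper could be exercised like this; the variable name and expression below are made up:

    from lammps import lammps

    lmp = lammps()
    lmp.command("variable a equal 2.0")
    # eval() parses the string like an immediate (equal-style) variable expression
    print(lmp.eval("v_a*3+sqrt(4)"))   # expected to print 8.0
    lmp.close()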

View File

@ -17,7 +17,7 @@
 ################################################################################
 class wrapper(object):
-  """lammps API IPython Wrapper
+  """ lammps API IPython Wrapper

   This is a wrapper class that provides additional methods on top of an
   existing :py:class:`lammps` instance. It provides additional methods

src/.gitignore
View File

@ -95,6 +95,8 @@
 /angle_lepton.h
 /dihedral_lepton.cpp
 /dihedral_lepton.h
+/fix_efield_lepton.cpp
+/fix_efield_lepton.h
 /fix_wall_lepton.cpp
 /fix_wall_lepton.h
 /lepton_utils.cpp
@ -250,6 +252,8 @@
 /*rheo*.cpp
 /*rheo*.h
+/compute_gaussian_grid_local.cpp
+/compute_gaussian_grid_local.h
 /compute_grid.cpp
 /compute_grid.h
 /compute_grid_local.cpp
@ -1219,6 +1223,9 @@
 /pair_dipole_cut.h
 /pair_dipole_sf.cpp
 /pair_dipole_sf.h
+/d3_parameters.h
+/pair_dispersion_d3.h
+/pair_dispersion_d3.cpp
 /pair_dsmc.cpp
 /pair_dsmc.h
 /pair_e3b.cpp

View File

@ -773,9 +773,9 @@ bigint FixAmoebaPiTorsion::read_data_skip_lines(char *keyword)
 void FixAmoebaPiTorsion::write_data_header(FILE *fp, int mth)
 {
-  if (mth == 0) fmt::print(fp,"{} pitorsions\n",npitorsions);
+  if (mth == 0) utils::print(fp,"{} pitorsions\n",npitorsions);
   else if (mth == 1)
-    fmt::print(fp, "{} pitorsion types\n",npitorsion_types);
+    utils::print(fp, "{} pitorsion types\n",npitorsion_types);
 }

 /* ----------------------------------------------------------------------

View File

@ -261,22 +261,22 @@ int BodyNparticle::write_data_body(FILE *fp, double *buf)
   // atomID ninteger ndouble
-  fmt::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
+  utils::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
   m += 3;
   const int nsub = (int) ubuf(buf[m++]).i;
-  fmt::print(fp,"{}\n",nsub);
+  utils::print(fp,"{}\n",nsub);
   // inertia
-  fmt::print(fp,"{} {} {} {} {} {}\n",
+  utils::print(fp,"{} {} {} {} {} {}\n",
              buf[m+0],buf[m+1],buf[m+2],buf[m+3],buf[m+4],buf[m+5]);
   m += 6;
   // nsub vertices
   for (int i = 0; i < nsub; i++) {
-    fmt::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
+    utils::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
     m += 3;
   }

View File

@ -398,27 +398,27 @@ int BodyRoundedPolygon::write_data_body(FILE *fp, double *buf)
   // atomID ninteger ndouble
-  fmt::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
+  utils::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
   m += 3;
   const int nsub = (int) ubuf(buf[m++]).i;
-  fmt::print(fp,"{}\n",nsub);
+  utils::print(fp,"{}\n",nsub);
   // inertia
-  fmt::print(fp,"{} {} {} {} {} {}\n",
+  utils::print(fp,"{} {} {} {} {} {}\n",
              buf[m+0],buf[m+1],buf[m+2],buf[m+3],buf[m+4],buf[m+5]);
   m += 6;
   // nsub vertices
   for (int i = 0; i < nsub; i++, m+=3)
-    fmt::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
+    utils::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
   // rounded diameter
   double diameter = buf[m++];
-  fmt::print(fp,"{}\n",diameter);
+  utils::print(fp,"{}\n",diameter);
   return m;
 }

View File

@ -476,7 +476,7 @@ int BodyRoundedPolyhedron::write_data_body(FILE *fp, double *buf)
   // atomID ninteger ndouble
-  fmt::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
+  utils::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
   m += 3;
   // nvert, nedge, nface
@ -484,27 +484,27 @@ int BodyRoundedPolyhedron::write_data_body(FILE *fp, double *buf)
   const int nsub = (int) ubuf(buf[m++]).i;
   const int nedge = (int) ubuf(buf[m++]).i;
   const int nface = (int) ubuf(buf[m++]).i;
-  fmt::print(fp,"{} {} {}\n",nsub,nedge,nface);
+  utils::print(fp,"{} {} {}\n",nsub,nedge,nface);
   // inertia
-  fmt::print(fp,"{} {} {} {} {} {}\n",
+  utils::print(fp,"{} {} {} {} {} {}\n",
              buf[m+0],buf[m+1],buf[m+2],buf[m+3],buf[m+4],buf[m+5]);
   m += 6;
   // nsub vertices
   for (int i = 0; i < nsub; i++, m+=3)
-    fmt::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
+    utils::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
   // nedge 2-tuples and nface 4-tuples
   // unless nsub = 1 or 2
   if (nsub > 2) {
     for (int i = 0; i < nedge; i++, m+=2)
-      fmt::print(fp,"{} {}\n",static_cast<int> (buf[m]),static_cast<int> (buf[m+1]));
+      utils::print(fp,"{} {}\n",static_cast<int> (buf[m]),static_cast<int> (buf[m+1]));
     for (int i = 0; i < nface; i++, m+=4)
-      fmt::print(fp,"{} {} {} {}\n",
+      utils::print(fp,"{} {} {} {}\n",
                static_cast<int> (buf[m]),static_cast<int> (buf[m+1]),
                static_cast<int> (buf[m+2]),static_cast<int> (buf[m+3]));
   }
@ -512,7 +512,7 @@ int BodyRoundedPolyhedron::write_data_body(FILE *fp, double *buf)
   // rounded diameter
   double diameter = buf[m++];
-  fmt::print(fp,"{}\n",diameter);
+  utils::print(fp,"{}\n",diameter);
   return m;
 }

View File

@ -57,6 +57,7 @@ ComputeTempBody::ComputeTempBody(LAMMPS *lmp, int narg, char **arg) :
if (strcmp(arg[iarg],"bias") == 0) { if (strcmp(arg[iarg],"bias") == 0) {
if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "compute temp/body bias", error); if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "compute temp/body bias", error);
tempbias = 1; tempbias = 1;
delete[] id_bias;
id_bias = utils::strdup(arg[iarg+1]); id_bias = utils::strdup(arg[iarg+1]);
iarg += 2; iarg += 2;
} else if (strcmp(arg[iarg],"dof") == 0) { } else if (strcmp(arg[iarg],"dof") == 0) {

View File

@ -33,8 +33,9 @@ using namespace LAMMPS_NS;
 using namespace FixConst;
 /* ---------------------------------------------------------------------- */
-FixBrownianBase::FixBrownianBase(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
+FixBrownianBase::FixBrownianBase(LAMMPS *lmp, int narg, char **arg) :
+    Fix(lmp, narg, arg), gamma_t_inv(nullptr), gamma_r_inv(nullptr), gamma_t_invsqrt(nullptr),
+    gamma_r_invsqrt(nullptr), dipole_body(nullptr), rng(nullptr)
 {
   time_integrate = 1;
@ -47,18 +48,18 @@ FixBrownianBase::FixBrownianBase(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, n
   planar_rot_flag = 0;
   g2 = 0.0;
-  if (narg < 5) error->all(FLERR, "Illegal fix brownian command.");
+  if (narg < 5) utils::missing_cmd_args(FLERR, "fix brownian", error);
   temp = utils::numeric(FLERR, arg[3], false, lmp);
-  if (temp <= 0) error->all(FLERR, "Fix brownian temp must be > 0.");
+  if (temp <= 0) error->all(FLERR, "Fix brownian temp must be > 0.0");
   seed = utils::inumeric(FLERR, arg[4], false, lmp);
-  if (seed <= 0) error->all(FLERR, "Fix brownian seed must be > 0.");
+  if (seed <= 0) error->all(FLERR, "Fix brownian seed must be > 0");
   int iarg = 5;
   while (iarg < narg) {
     if (strcmp(arg[iarg], "rng") == 0) {
-      if (narg == iarg + 1) error->all(FLERR, "Illegal fix brownian command.");
+      if (narg < iarg + 1) utils::missing_cmd_args(FLERR, "fix brownian rng", error);
       if (strcmp(arg[iarg + 1], "uniform") == 0) {
         noise_flag = 1;
       } else if (strcmp(arg[iarg + 1], "gaussian") == 0) {
@ -67,13 +68,14 @@ FixBrownianBase::FixBrownianBase(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, n
       } else if (strcmp(arg[iarg + 1], "none") == 0) {
         noise_flag = 0;
       } else {
-        error->all(FLERR, "Illegal fix brownian command.");
+        error->all(FLERR, "Unknown fix brownian rng keyword {}", arg[iarg + 1]);
       }
       iarg = iarg + 2;
     } else if (strcmp(arg[iarg], "dipole") == 0) {
-      if (narg == iarg + 3) error->all(FLERR, "Illegal fix brownian command.");
+      if (narg < iarg + 3) utils::missing_cmd_args(FLERR, "fix brownian dipole", error);
       dipole_flag = 1;
+      delete[] dipole_body;
       dipole_body = new double[3];
       dipole_body[0] = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
@ -82,9 +84,11 @@ FixBrownianBase::FixBrownianBase(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, n
       iarg = iarg + 4;
     } else if (strcmp(arg[iarg], "gamma_t_eigen") == 0) {
-      if (narg == iarg + 3) error->all(FLERR, "Illegal fix brownian command.");
+      if (narg < iarg + 3) utils::missing_cmd_args(FLERR, "fix brownian gamma_t_eigen", error);
       gamma_t_eigen_flag = 1;
+      delete[] gamma_t_inv;
+      delete[] gamma_t_invsqrt;
       gamma_t_inv = new double[3];
       gamma_t_invsqrt = new double[3];
       gamma_t_inv[0] = 1. / utils::numeric(FLERR, arg[iarg + 1], false, lmp);
@ -111,6 +115,8 @@ FixBrownianBase::FixBrownianBase(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, n
       if (narg == iarg + 3) error->all(FLERR, "Illegal fix brownian command.");
       gamma_r_eigen_flag = 1;
+      delete[] gamma_r_inv;
+      delete[] gamma_r_invsqrt;
       gamma_r_inv = new double[3];
       gamma_r_invsqrt = new double[3];

View File

@ -119,7 +119,7 @@ FixRX::FixRX(LAMMPS *lmp, int narg, char **arg) :
+ " expected \"sparse\" or \"dense\"\n"); + " expected \"sparse\" or \"dense\"\n");
if (comm->me == 0 && Verbosity > 1) if (comm->me == 0 && Verbosity > 1)
error->message(FLERR, fmt::format("FixRX: matrix format is {}",word)); error->message(FLERR, fmt::format("FixRX: matrix format is {}", word));
} }
// Determine the ODE solver/stepper strategy in arg[6]. // Determine the ODE solver/stepper strategy in arg[6].
@ -157,7 +157,7 @@ FixRX::FixRX(LAMMPS *lmp, int narg, char **arg) :
minSteps = utils::inumeric(FLERR,arg[iarg++],false,lmp); minSteps = utils::inumeric(FLERR,arg[iarg++],false,lmp);
if (comm->me == 0 && Verbosity > 1) if (comm->me == 0 && Verbosity > 1)
error->message(FLERR,fmt::format("FixRX: RK4 numSteps= {}", minSteps)); error->message(FLERR, fmt::format("FixRX: RK4 numSteps= {}", minSteps));
} else if (odeIntegrationFlag == ODE_LAMMPS_RK4 && narg>8) { } else if (odeIntegrationFlag == ODE_LAMMPS_RK4 && narg>8) {
error->all(FLERR,"Illegal fix rx command. Too many arguments for RK4 solver."); error->all(FLERR,"Illegal fix rx command. Too many arguments for RK4 solver.");
} else if (odeIntegrationFlag == ODE_LAMMPS_RKF45) { } else if (odeIntegrationFlag == ODE_LAMMPS_RKF45) {
@ -307,12 +307,19 @@ void FixRX::post_constructor()
id_fix_species = utils::strdup(std::string(id)+"_SPECIES"); id_fix_species = utils::strdup(std::string(id)+"_SPECIES");
id_fix_species_old = utils::strdup(std::string(id)+"_SPECIES_OLD"); id_fix_species_old = utils::strdup(std::string(id)+"_SPECIES_OLD");
const std::string fmtstr = "{} {} property/atom "; std::string newcmd1 = id_fix_species;
auto newcmd1 = fmt::format(fmtstr,id_fix_species,group->names[igroup]); newcmd1 += " ";
auto newcmd2 = fmt::format(fmtstr,id_fix_species_old,group->names[igroup]); newcmd1 += group->names[igroup];
newcmd1 += " property/atom ";
std::string newcmd2 = id_fix_species_old;
newcmd2 += " ";
newcmd2 += group->names[igroup];
newcmd2 += " property/atom ";
for (int ii=0; ii<nspecies; ii++) { for (int ii=0; ii<nspecies; ii++) {
newcmd1 += fmt::format(" d_{}",tmpspecies[ii]); newcmd1 += fmt::format(" d_{}", tmpspecies[ii]);
newcmd2 += fmt::format(" d_{}Old",tmpspecies[ii]); newcmd2 += fmt::format(" d_{}Old", tmpspecies[ii]);
} }
newcmd1 += " ghost yes"; newcmd1 += " ghost yes";
newcmd2 += " ghost yes"; newcmd2 += " ghost yes";

View File

@ -1363,10 +1363,10 @@ int FixElectrodeConp::setmask()
 void FixElectrodeConp::write_to_file(FILE *file, const std::vector<tagint> &tags,
                                      const std::vector<std::vector<double>> &mat)
 {
-  for (const auto &t : tags) fmt::print(file, "{:20}", t);
+  for (const auto &t : tags) utils::print(file, "{:20}", t);
   fputs("\n", file);
   for (const auto &vec : mat) {
-    for (const auto &x : vec) fmt::print(file, "{:20.11e}", x);
+    for (const auto &x : vec) utils::print(file, "{:20.11e}", x);
     fputs("\n", file);
   }
 }

View File

@ -93,7 +93,7 @@ void Group2Ndx::write_group(FILE *fp, int gid)
   if (gid == 0) {
     fputs("[ System ]\n", fp);
   } else {
-    fmt::print(fp, "[ {} ]\n", group->names[gid]);
+    utils::print(fp, "[ {} ]\n", group->names[gid]);
   }
   width = log10((double) atom->natoms) + 2;
   cols = 80 / width;
@ -142,7 +142,7 @@ void Group2Ndx::write_group(FILE *fp, int gid)
   if (fp) {
     int i, j;
     for (i = 0, j = 0; i < gcount; ++i) {
-      fmt::print(fp, "{:>{}}", recvlist[i], width);
+      utils::print(fp, "{:>{}}", recvlist[i], width);
       ++j;
       if (j == cols) {
         fputs("\n", fp);

View File

@ -39,6 +39,8 @@ using MathConst::RAD2DEG;
 enum { DEGREE, RADIAN, COSINE };
+static constexpr double BIG = 1.0e20;
 /* ----------------------------------------------------------------------
    compute angular distribution functions for I, J, K atoms
 ---------------------------------------------------------------------- */
@ -133,15 +135,15 @@ ComputeADF::ComputeADF(LAMMPS *lmp, int narg, char **arg) :
       utils::bounds(FLERR,arg[iarg+1],1,atom->ntypes,jlo[m],jhi[m],error);
       utils::bounds(FLERR,arg[iarg+2],1,atom->ntypes,klo[m],khi[m],error);
       if ((ilo[m] > ihi[m]) || (jlo[m] > jhi[m]) || (klo[m] > khi[m]))
-        error->all(FLERR,"Illegal compute adf command");
+        error->all(FLERR,"Illegal compute adf command index range");
       rcutinnerj[m] = utils::numeric(FLERR,arg[iarg+3],false,lmp);
       rcutouterj[m] = utils::numeric(FLERR,arg[iarg+4],false,lmp);
       if (rcutinnerj[m] < 0.0 || rcutinnerj[m] >= rcutouterj[m])
-        error->all(FLERR,"Illegal compute adf command");
+        error->all(FLERR,"Illegal compute adf command j-cutoff");
       rcutinnerk[m] = utils::numeric(FLERR,arg[iarg+5],false,lmp);
       rcutouterk[m] = utils::numeric(FLERR,arg[iarg+6],false,lmp);
       if (rcutinnerk[m] < 0.0 || rcutinnerk[m] >= rcutouterk[m])
-        error->all(FLERR,"Illegal compute adf command");
+        error->all(FLERR,"Illegal compute adf command k-cutoff");
       iarg += nargsperadf;
     }
   }
@ -290,8 +292,8 @@ void ComputeADF::init()
     double skin = neighbor->skin;
     mycutneigh = maxouter + skin;
     if (mycutneigh > comm->cutghostuser)
-      error->all(FLERR,"Compute adf outer cutoff exceeds ghost atom range - "
-                 "use comm_modify cutoff command");
+      error->all(FLERR,"Compute adf outer cutoff {} exceeds ghost atom range {} - "
+                 "use comm_modify cutoff command", mycutneigh, comm->cutghostuser);
   }
   // assign ordinate values to 1st column of output array
@ -328,6 +330,7 @@ void ComputeADF::init()
   if (mycutneigh > 0.0) {
     if ((neighbor->style == Neighbor::MULTI) || (neighbor->style == Neighbor::MULTI_OLD))
       error->all(FLERR, "Compute adf with custom cutoffs requires neighbor style 'bin' or 'nsq'");
+
     req->set_cutoff(mycutneigh);
   }
 }

View File

@ -94,31 +94,31 @@ void DumpYAML::write_header(bigint ndump)
   if (comm->me == 0) {
     const std::string boundary(boundstr);
-    fmt::print(fp, "---\ncreator: LAMMPS\ntimestep: {}\n", update->ntimestep);
+    utils::print(fp, "---\ncreator: LAMMPS\ntimestep: {}\n", update->ntimestep);
-    if (unit_flag) fmt::print(fp, "units: {}\n", update->unit_style);
+    if (unit_flag) utils::print(fp, "units: {}\n", update->unit_style);
-    if (time_flag) fmt::print(fp, "time: {:.16g}\n", compute_time());
+    if (time_flag) utils::print(fp, "time: {:.16g}\n", compute_time());
-    fmt::print(fp, "natoms: {}\n", ndump);
+    utils::print(fp, "natoms: {}\n", ndump);
     fputs("boundary: [ ", fp);
     for (const auto &bflag : boundary) {
       if (bflag == ' ') continue;
-      fmt::print(fp, "{}, ", bflag);
+      utils::print(fp, "{}, ", bflag);
     }
     fputs("]\n", fp);
-    if (thermo) fmt::print(fp, thermo_data);
+    if (thermo) utils::print(fp, thermo_data);
-    fmt::print(fp, "box:\n - [ {}, {} ]\n", boxxlo, boxxhi);
+    utils::print(fp, "box:\n - [ {}, {} ]\n", boxxlo, boxxhi);
-    fmt::print(fp, " - [ {}, {} ]\n", boxylo, boxyhi);
+    utils::print(fp, " - [ {}, {} ]\n", boxylo, boxyhi);
-    fmt::print(fp, " - [ {}, {} ]\n", boxzlo, boxzhi);
+    utils::print(fp, " - [ {}, {} ]\n", boxzlo, boxzhi);
-    if (domain->triclinic) fmt::print(fp, " - [ {}, {}, {} ]\n", boxxy, boxxz, boxyz);
+    if (domain->triclinic) utils::print(fp, " - [ {}, {}, {} ]\n", boxxy, boxxz, boxyz);
-    fmt::print(fp, "keywords: [ ");
+    utils::print(fp, "keywords: [ ");
     for (const auto &item : utils::split_words(columns)) {
       if (item.find_first_of(special_chars) == std::string::npos)
-        fmt::print(fp, "{}, ", item);
+        utils::print(fp, "{}, ", item);
       else
-        fmt::print(fp, "'{}', ", item);
+        utils::print(fp, "'{}', ", item);
     }
     fputs(" ]\ndata:\n", fp);
   } else // reset so that the remainder of the output is not multi-proc

View File

@ -489,7 +489,7 @@ void FixAveCorrelateLong::end_of_step()
   if (fp && comm->me == 0) {
     clearerr(fp);
     if (overwrite) (void) platform::fseek(fp,filepos);
-    fmt::print(fp,"# Timestep: {}\n", ntimestep);
+    utils::print(fp,"# Timestep: {}\n", ntimestep);
     for (unsigned int i=0; i < npcorr; ++i) {
       fprintf(fp, "%lg ", t[i]*update->dt*nevery);
       for (int j=0; j < npair; ++j) {

View File

@ -270,7 +270,7 @@ void FixTMD::initial_integrate(int /*vflag*/)
     work_lambda += lambda*(rho_target - rho_old);
     if (!(update->ntimestep % nfileevery) &&
         (previous_stat != update->ntimestep)) {
-      fmt::print(fp, "{} {} {} {} {} {} {} {}\n", update->ntimestep,rho_target,rho_old,
+      utils::print(fp, "{} {} {} {} {} {} {} {}\n", update->ntimestep,rho_target,rho_old,
                  gamma_back,gamma_forward,lambda,work_lambda,work_analytical);
       fflush(fp);
       previous_stat = update->ntimestep;

View File

@ -523,7 +523,7 @@ void FixTTM::write_electron_temperatures(const std::string &filename)
   FILE *fp = fopen(filename.c_str(),"w");
   if (!fp) error->one(FLERR,"Fix ttm could not open output file {}: {}",
                       filename,utils::getsyserror());
-  fmt::print(fp,"# DATE: {} UNITS: {} COMMENT: Electron temperature on "
+  utils::print(fp,"# DATE: {} UNITS: {} COMMENT: Electron temperature on "
              "{}x{}x{} grid at step {} - created by fix {}\n", utils::current_date(),
              update->unit_style, nxgrid, nygrid, nzgrid, update->ntimestep, style);

View File

@ -411,7 +411,7 @@ void FixTTMGrid::write_restart_file(const char *file)
   if (fpout == nullptr)
     error->one(FLERR,"Cannot open fix ttm/grid restart file {}: {}",outfile,utils::getsyserror());
-  fmt::print(fpout,"# DATE: {} UNITS: {} COMMENT: "
+  utils::print(fpout,"# DATE: {} UNITS: {} COMMENT: "
              "Electron temperature on {}x{}x{} grid at step {} - "
              "created by fix {}\n",
              utils::current_date(),update->unit_style,

View File

@ -74,14 +74,13 @@ static constexpr double SHIFT = 0.0;
 /* ---------------------------------------------------------------------- */
 FixTTMMod::FixTTMMod(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg),
-  random(nullptr), gfactor1(nullptr), gfactor2(nullptr), ratio(nullptr), flangevin(nullptr),
-  T_electron(nullptr), T_electron_old(nullptr), net_energy_transfer(nullptr),
-  net_energy_transfer_all(nullptr)
+  Fix(lmp, narg, arg), infile(nullptr), outfile(nullptr), random(nullptr), gfactor1(nullptr),
+  gfactor2(nullptr), ratio(nullptr), flangevin(nullptr), T_electron(nullptr),
+  T_electron_old(nullptr), net_energy_transfer(nullptr), net_energy_transfer_all(nullptr)
 {
   if (lmp->citeme) lmp->citeme->add(cite_fix_ttm_mod);
-  if (narg < 8) error->all(FLERR,"Illegal fix ttm/mod command");
+  if (narg < 8) utils::missing_cmd_args(FLERR, "fix ttm/mod", error);
   vector_flag = 1;
   size_vector = 2;
@ -103,27 +102,29 @@ FixTTMMod::FixTTMMod(LAMMPS *lmp, int narg, char **arg) :
   int iarg = 8;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"set") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ttm/mod command");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ttm/mod set", error);
       tinit = utils::numeric(FLERR,arg[iarg+1],false,lmp);
       if (tinit <= 0.0)
         error->all(FLERR,"Fix ttm/mod initial temperature must be > 0.0");
       iarg += 2;
     } else if (strcmp(arg[iarg],"infile") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ttm/mod command");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ttm/mod infile", error);
+      delete[] infile;
       infile = utils::strdup(arg[iarg+1]);
       iarg += 2;
     } else if (strcmp(arg[iarg],"outfile") == 0) {
-      if (iarg+3 > narg) error->all(FLERR,"Illegal fix ttm/mod command");
+      if (iarg+3 > narg) utils::missing_cmd_args(FLERR, "fix ttm/mod outfile", error);
+      delete[] outfile;
       outevery = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
       outfile = utils::strdup(arg[iarg+2]);
       iarg += 3;
-    } else error->all(FLERR,"Illegal fix ttm/mod command");
+    } else error->all(FLERR,"Unknown fix ttm/mod keyword {}", arg[iarg]);
   }
   // error check
   if (seed <= 0)
-    error->all(FLERR,"Invalid random number seed in fix ttm/mod command");
+    error->all(FLERR,"Invalid random number seed {} in fix ttm/mod command", seed);
   if (nxgrid <= 0 || nygrid <= 0 || nzgrid <= 0)
     error->all(FLERR,"Fix ttm/mod grid sizes must be > 0");
@ -152,7 +153,8 @@ FixTTMMod::FixTTMMod(LAMMPS *lmp, int narg, char **arg) :
   if (v_0 < 0.0) error->all(FLERR,"Fix ttm/mod v_0 must be >= 0.0");
   if (ionic_density <= 0.0) error->all(FLERR,"Fix ttm/mod ionic_density must be > 0.0");
   if (surface_l < 0) error->all(FLERR,"Surface coordinates must be >= 0");
-  if (surface_l >= surface_r) error->all(FLERR, "Left surface coordinate must be less than right surface coordinate");
+  if (surface_l >= surface_r)
+    error->all(FLERR, "Left surface coordinate must be less than right surface coordinate");
   // initialize Marsaglia RNG with processor-unique seed
@ -168,10 +170,8 @@ FixTTMMod::FixTTMMod(LAMMPS *lmp, int narg, char **arg) :
   memory->create(T_electron_old,nzgrid,nygrid,nxgrid,"ttm/mod:T_electron_old");
   memory->create(T_electron_first,nzgrid,nygrid,nxgrid,"ttm/mod:T_electron_first");
   memory->create(T_electron,nzgrid,nygrid,nxgrid,"ttm/mod:T_electron");
-  memory->create(net_energy_transfer,nzgrid,nygrid,nxgrid,
-                 "ttm/mod:net_energy_transfer");
-  memory->create(net_energy_transfer_all,nzgrid,nygrid,nxgrid,
-                 "ttm/mod:net_energy_transfer_all");
+  memory->create(net_energy_transfer,nzgrid,nygrid,nxgrid,"ttm/mod:net_energy_transfer");
+  memory->create(net_energy_transfer_all,nzgrid,nygrid,nxgrid,"ttm/mod:net_energy_transfer_all");
   flangevin = nullptr;
   grow_arrays(atom->nmax);
@ -628,7 +628,7 @@ void FixTTMMod::write_electron_temperatures(const std::string &filename)
   FILE *fp = fopen(filename.c_str(),"w");
   if (!fp) error->one(FLERR,"Fix ttm/mod could not open output file {}: {}",
                       filename, utils::getsyserror());
-  fmt::print(fp,"# DATE: {} UNITS: {} COMMENT: Electron temperature "
+  utils::print(fp,"# DATE: {} UNITS: {} COMMENT: Electron temperature "
              "{}x{}x{} grid at step {}. Created by fix {}\n", utils::current_date(),
              update->unit_style, nxgrid, nygrid, nzgrid, update->ntimestep, style);

View File

@ -29,7 +29,7 @@ using namespace LAMMPS_NS;
using namespace MathConst; using namespace MathConst;
static constexpr double SMALL = 0.001; static constexpr double SMALL = 0.001;
static constexpr double SMALLG = 2.0e-308; static constexpr double SMALLG = 2.3e-308;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */

View File

@ -27,7 +27,7 @@
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
using namespace MathConst; using namespace MathConst;
static constexpr double SMALL = 2.0e-308; static constexpr double SMALL = 2.3e-308;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
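The bump from 2.0e-308 to 2.3e-308 in both files above moves the threshold just past the smallest normalized double (DBL_MIN ≈ 2.2250738585072014e-308); 2.0e-308 is subnormal, which flush-to-zero floating-point modes can silently treat as 0.0 — likely the motivation for the change. A minimal standalone check (not part of the diff) illustrating the distinction:

#include <cfloat>
#include <cstdio>

int main() {
  // DBL_MIN is the smallest normalized double, about 2.2250738585072014e-308
  printf("2.0e-308 >= DBL_MIN ? %d\n", 2.0e-308 >= DBL_MIN);  // 0: subnormal
  printf("2.3e-308 >= DBL_MIN ? %d\n", 2.3e-308 >= DBL_MIN);  // 1: normalized
  return 0;
}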

33960
src/EXTRA-PAIR/d3_parameters.h Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,81 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
// clang-format off
PairStyle(dispersion/d3,PairDispersionD3);
// clang-format on
#else
#ifndef LMP_PAIR_DISPERSION_D3_H
#define LMP_PAIR_DISPERSION_D3_H
#include "pair.h"
namespace LAMMPS_NS {
class PairDispersionD3 : public Pair {
public:
PairDispersionD3(class LAMMPS *);
~PairDispersionD3() override;
void compute(int, int) override;
void settings(int, char **) override;
void coeff(int, char **) override;
void init_style() override;
double init_one(int, int) override;
int pack_forward_comm(int, int *, double *, int, int *) override;
int pack_reverse_comm(int, int, double *) override;
void unpack_forward_comm(int, int, double *) override;
void unpack_reverse_comm(int, int *, double *) override;
protected:
int nmax;
double rthr; // R^2 distance to cutoff for D3_calculation
double cn_thr; // R^2 distance to cutoff for CN_calculation
std::string damping_type; // damping function type
double s6, s8, s18, rs6, rs8, rs18; // XC parameters
double a1, a2, alpha, alpha6, alpha8;
double *r2r4; // scale r4/r2 values of the atoms by sqrt(Z)
double *rcov; // covalent radii
int *mxci; // How large the grid for c6 interpolation
double **r0ab; // cut-off radii for all element pairs
double *****c6ab; // C6 for all element pairs
double *cn; // Coordination numbers
double *dc6; // dC6i(iat) saves dE_dsp/dCN(iat)
int communicationStage; // communication stage
void allocate();
virtual void set_funcpar(std::string &);
void calc_coordination_number();
int find_atomic_number(std::string &);
std::vector<int> is_int_in_array(int *, int, int);
void read_r0ab(int *, int);
void set_limit_in_pars_array(int &, int &, int &, int &);
void read_c6ab(int *, int);
double *get_dC6(int, int, double, double);
};
} // namespace LAMMPS_NS
#endif
#endif
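For orientation, the s6/s8 and a1/a2 members declared above are the usual DFT-D3 scaling and Becke-Johnson damping parameters. A self-contained sketch (not LAMMPS code; the C6/C8/r values are placeholders, and the functional parameters are the published PBE-D3(BJ) set) of the damped two-body term they enter:

#include <cmath>
#include <cstdio>

int main() {
  const double s6 = 1.0, s8 = 0.7875, a1 = 0.4289, a2 = 4.4407;  // PBE-D3(BJ)
  const double c6 = 40.0, c8 = 1600.0, r = 7.0;                  // placeholder pair data (a.u.)
  const double r0 = std::sqrt(c8 / c6);                          // R0_AB = sqrt(C8/C6)
  const double f  = a1 * r0 + a2;                                // BJ damping radius
  const double e  = -s6 * c6 / (std::pow(r, 6) + std::pow(f, 6))
                    -s8 * c8 / (std::pow(r, 8) + std::pow(f, 8));
  printf("E_disp(pair) = %g\n", e);
  return 0;
}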

View File

@ -163,6 +163,7 @@ FixWallGran::FixWallGran(LAMMPS *lmp, int narg, char **arg) :
} else if (strcmp(arg[iarg],"region") == 0) { } else if (strcmp(arg[iarg],"region") == 0) {
if (narg < iarg+2) error->all(FLERR,"Illegal fix wall/gran command"); if (narg < iarg+2) error->all(FLERR,"Illegal fix wall/gran command");
wallstyle = REGION; wallstyle = REGION;
delete[] idregion;
idregion = utils::strdup(arg[iarg+1]); idregion = utils::strdup(arg[iarg+1]);
iarg += 2; iarg += 2;
// This option is only compatible with fix wall/gran/region // This option is only compatible with fix wall/gran/region
@ -205,6 +206,7 @@ FixWallGran::FixWallGran(LAMMPS *lmp, int narg, char **arg) :
} else if (strcmp(arg[iarg],"temperature") == 0) { } else if (strcmp(arg[iarg],"temperature") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal fix wall/gran command"); if (iarg+2 > narg) error->all(FLERR,"Illegal fix wall/gran command");
if (utils::strmatch(arg[iarg+1], "^v_")) { if (utils::strmatch(arg[iarg+1], "^v_")) {
delete[] tstr;
tstr = utils::strdup(arg[iarg+1] + 2); tstr = utils::strdup(arg[iarg+1] + 2);
} else { } else {
Twall = utils::numeric(FLERR,arg[iarg+1],false,lmp); Twall = utils::numeric(FLERR,arg[iarg+1],false,lmp);

View File

@ -94,7 +94,7 @@ void VerletLRTIntel::setup(int flag)
if (comm->me == 0 && screen) { if (comm->me == 0 && screen) {
fputs("Setting up VerletLRTIntel run ...\n",screen); fputs("Setting up VerletLRTIntel run ...\n",screen);
if (flag) { if (flag) {
fmt::print(screen," Unit style : {}\n" utils::print(screen," Unit style : {}\n"
" Current step : {}\n" " Current step : {}\n"
" Time step : {}\n", " Time step : {}\n",
update->unit_style,update->ntimestep,update->dt); update->unit_style,update->ntimestep,update->dt);

View File

@ -119,6 +119,14 @@ action compute_composition_atom_kokkos.cpp compute_composition_atom.cpp
action compute_composition_atom_kokkos.h compute_composition_atom.h action compute_composition_atom_kokkos.h compute_composition_atom.h
action compute_orientorder_atom_kokkos.cpp action compute_orientorder_atom_kokkos.cpp
action compute_orientorder_atom_kokkos.h action compute_orientorder_atom_kokkos.h
action compute_sna_grid_kokkos.cpp compute_sna_grid.cpp
action compute_sna_grid_kokkos.h compute_sna_grid.h
action compute_sna_grid_kokkos_impl.h compute_sna_grid.cpp
action compute_sna_grid_local_kokkos.cpp compute_sna_grid_local.cpp
action compute_sna_grid_local_kokkos.h compute_sna_grid_local.h
action compute_sna_grid_local_kokkos_impl.h compute_sna_grid_local.cpp
action compute_gaussian_grid_local_kokkos.cpp compute_gaussian_grid_local.cpp
action compute_gaussian_grid_local_kokkos.h compute_gaussian_grid_local.h
action compute_temp_deform_kokkos.cpp action compute_temp_deform_kokkos.cpp
action compute_temp_deform_kokkos.h action compute_temp_deform_kokkos.h
action compute_temp_kokkos.cpp action compute_temp_kokkos.cpp
@ -230,7 +238,6 @@ action fix_wall_region_kokkos.cpp
action fix_wall_region_kokkos.h action fix_wall_region_kokkos.h
action grid3d_kokkos.cpp fft3d.h action grid3d_kokkos.cpp fft3d.h
action grid3d_kokkos.h fft3d.h action grid3d_kokkos.h fft3d.h
action group_kokkos.cpp
action group_kokkos.h action group_kokkos.h
action improper_class2_kokkos.cpp improper_class2.cpp action improper_class2_kokkos.cpp improper_class2.cpp
action improper_class2_kokkos.h improper_class2.h action improper_class2_kokkos.h improper_class2.h

View File

@ -72,14 +72,14 @@ void AngleHarmonicKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
// reallocate per-atom arrays if necessary // reallocate per-atom arrays if necessary
if (eflag_atom) { if (eflag_atom) {
if(k_eatom.extent(0) < maxeatom) { if ((int)k_eatom.extent(0) < maxeatom) {
memoryKK->destroy_kokkos(k_eatom,eatom); memoryKK->destroy_kokkos(k_eatom,eatom);
memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"angle:eatom"); memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"angle:eatom");
d_eatom = k_eatom.template view<DeviceType>(); d_eatom = k_eatom.template view<DeviceType>();
} else Kokkos::deep_copy(d_eatom,0.0); } else Kokkos::deep_copy(d_eatom,0.0);
} }
if (vflag_atom) { if (vflag_atom) {
if(k_vatom.extent(0) < maxvatom) { if ((int)k_vatom.extent(0) < maxvatom) {
memoryKK->destroy_kokkos(k_vatom,vatom); memoryKK->destroy_kokkos(k_vatom,vatom);
memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"angle:vatom"); memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"angle:vatom");
d_vatom = k_vatom.template view<DeviceType>(); d_vatom = k_vatom.template view<DeviceType>();
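The added (int) casts address a signed/unsigned mismatch: Kokkos::View::extent() returns size_t while maxeatom/maxvatom are plain ints, so the bare comparison trips -Wsign-compare; the same cast is applied in the bond and hybrid styles below. A minimal sketch of the pattern:

#include <cstddef>

bool needs_realloc(std::size_t extent0, int maxeatom) {
  // extent0 < maxeatom would compare signed with unsigned (-Wsign-compare);
  // casting the extent to int, as in the diff, keeps the comparison well
  // defined for the array sizes used here
  return (int)extent0 < maxeatom;
}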

View File

@ -76,7 +76,7 @@ void AngleHybridKokkos::compute(int eflag, int vflag)
Kokkos::parallel_for(nanglelist_orig,LAMMPS_LAMBDA(int i) { Kokkos::parallel_for(nanglelist_orig,LAMMPS_LAMBDA(int i) {
const int m = d_map[d_anglelist_orig(i,3)]; const int m = d_map[d_anglelist_orig(i,3)];
if (m >= 0) Kokkos::atomic_increment(&d_nanglelist[m]); if (m >= 0) Kokkos::atomic_inc(&d_nanglelist[m]);
}); });
k_nanglelist.modify_device(); k_nanglelist.modify_device();
@ -87,7 +87,7 @@ void AngleHybridKokkos::compute(int eflag, int vflag)
if (h_nanglelist[m] > maxangle_all) if (h_nanglelist[m] > maxangle_all)
maxangle_all = h_nanglelist[m] + EXTRA; maxangle_all = h_nanglelist[m] + EXTRA;
if (k_anglelist.d_view.extent(1) < maxangle_all) if ((int)k_anglelist.d_view.extent(1) < maxangle_all)
MemKK::realloc_kokkos(k_anglelist, "angle_hybrid:anglelist", nstyles, maxangle_all, 4); MemKK::realloc_kokkos(k_anglelist, "angle_hybrid:anglelist", nstyles, maxangle_all, 4);
auto d_anglelist = k_anglelist.d_view; auto d_anglelist = k_anglelist.d_view;
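Kokkos::atomic_increment() is deprecated in Kokkos 4.x in favor of Kokkos::atomic_inc(), which this diff (and the bond_hybrid one below) adopts. A minimal standalone usage sketch, assuming a Kokkos build is available:

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char **argv) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<int> count("count");
    Kokkos::parallel_for("inc", 100, KOKKOS_LAMBDA(int) {
      Kokkos::atomic_inc(&count());   // replaces the deprecated atomic_increment()
    });
    auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, count);
    printf("count = %d\n", h());      // 100
  }
  Kokkos::finalize();
  return 0;
}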

View File

@ -67,14 +67,14 @@ void BondHarmonicKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
// reallocate per-atom arrays if necessary // reallocate per-atom arrays if necessary
if (eflag_atom) { if (eflag_atom) {
if (k_eatom.extent(0) < maxeatom) { if ((int)k_eatom.extent(0) < maxeatom) {
memoryKK->destroy_kokkos(k_eatom,eatom); memoryKK->destroy_kokkos(k_eatom,eatom);
memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"improper:eatom"); memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"improper:eatom");
d_eatom = k_eatom.template view<KKDeviceType>(); d_eatom = k_eatom.template view<KKDeviceType>();
} else Kokkos::deep_copy(d_eatom,0.0); } else Kokkos::deep_copy(d_eatom,0.0);
} }
if (vflag_atom) { if (vflag_atom) {
if (k_vatom.extent(0) < maxvatom) { if ((int)k_vatom.extent(0) < maxvatom) {
memoryKK->destroy_kokkos(k_vatom,vatom); memoryKK->destroy_kokkos(k_vatom,vatom);
memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"improper:vatom"); memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"improper:vatom");
d_vatom = k_vatom.template view<KKDeviceType>(); d_vatom = k_vatom.template view<KKDeviceType>();

View File

@ -76,7 +76,7 @@ void BondHybridKokkos::compute(int eflag, int vflag)
Kokkos::parallel_for(nbondlist_orig,LAMMPS_LAMBDA(int i) { Kokkos::parallel_for(nbondlist_orig,LAMMPS_LAMBDA(int i) {
const int m = d_map[d_bondlist_orig(i,2)]; const int m = d_map[d_bondlist_orig(i,2)];
if (m >= 0) Kokkos::atomic_increment(&d_nbondlist[m]); if (m >= 0) Kokkos::atomic_inc(&d_nbondlist[m]);
}); });
k_nbondlist.modify_device(); k_nbondlist.modify_device();
@ -87,7 +87,7 @@ void BondHybridKokkos::compute(int eflag, int vflag)
if (h_nbondlist[m] > maxbond_all) if (h_nbondlist[m] > maxbond_all)
maxbond_all = h_nbondlist[m] + EXTRA; maxbond_all = h_nbondlist[m] + EXTRA;
if (k_bondlist.d_view.extent(1) < maxbond_all) if ((int)k_bondlist.d_view.extent(1) < maxbond_all)
MemKK::realloc_kokkos(k_bondlist, "bond_hybrid:bondlist", nstyles, maxbond_all, 3); MemKK::realloc_kokkos(k_bondlist, "bond_hybrid:bondlist", nstyles, maxbond_all, 3);
auto d_bondlist = k_bondlist.d_view; auto d_bondlist = k_bondlist.d_view;

View File

@ -37,6 +37,8 @@ static constexpr int BUFEXTRA = 1000;
CommTiledKokkos::CommTiledKokkos(LAMMPS *_lmp) : CommTiled(_lmp) CommTiledKokkos::CommTiledKokkos(LAMMPS *_lmp) : CommTiled(_lmp)
{ {
sendlist = nullptr; sendlist = nullptr;
maxsendlist = nullptr;
nprocmaxtot = 0;
} }
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
@ -49,6 +51,8 @@ CommTiledKokkos::CommTiledKokkos(LAMMPS *_lmp) : CommTiled(_lmp)
CommTiledKokkos::CommTiledKokkos(LAMMPS *_lmp, Comm *oldcomm) : CommTiled(_lmp,oldcomm) CommTiledKokkos::CommTiledKokkos(LAMMPS *_lmp, Comm *oldcomm) : CommTiled(_lmp,oldcomm)
{ {
sendlist = nullptr; sendlist = nullptr;
maxsendlist = nullptr;
nprocmaxtot = 0;
} }
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
@ -56,7 +60,9 @@ CommTiledKokkos::CommTiledKokkos(LAMMPS *_lmp, Comm *oldcomm) : CommTiled(_lmp,o
CommTiledKokkos::~CommTiledKokkos() CommTiledKokkos::~CommTiledKokkos()
{ {
memoryKK->destroy_kokkos(k_sendlist,sendlist); memoryKK->destroy_kokkos(k_sendlist,sendlist);
memory->destroy(maxsendlist);
sendlist = nullptr; sendlist = nullptr;
maxsendlist = nullptr;
buf_send = nullptr; buf_send = nullptr;
buf_recv = nullptr; buf_recv = nullptr;
} }
@ -657,12 +663,11 @@ void CommTiledKokkos::grow_list(int iswap, int iwhich, int n)
k_sendlist.sync<LMPHostType>(); k_sendlist.sync<LMPHostType>();
k_sendlist.modify<LMPHostType>(); k_sendlist.modify<LMPHostType>();
if (size > (int)k_sendlist.extent(2)) { memoryKK->grow_kokkos(k_sendlist,sendlist,maxswap,nprocmaxtot,size,"comm:sendlist");
memoryKK->grow_kokkos(k_sendlist,sendlist,maxswap,maxsend,size,"comm:sendlist");
for (int i = 0; i < maxswap; i++) for (int i = 0; i < maxswap; i++)
maxsendlist[iswap][iwhich] = size; for (int j = 0; j < nprocmaxtot; j++)
} maxsendlist[i][j] = size;
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
@ -692,24 +697,23 @@ void CommTiledKokkos::grow_swap_send(int i, int n, int /*nold*/)
memory->destroy(sendbox_multiold[i]); memory->destroy(sendbox_multiold[i]);
memory->create(sendbox_multiold[i],n,atom->ntypes+1,6,"comm:sendbox_multiold"); memory->create(sendbox_multiold[i],n,atom->ntypes+1,6,"comm:sendbox_multiold");
delete [] maxsendlist[i]; if (sendlist && !k_sendlist.h_view.data()) {
maxsendlist[i] = new int[n];
for (int j = 0; j < n; j++)
maxsendlist[i][j] = BUFMIN;
if (sendlist && !k_sendlist.d_view.data()) {
for (int ii = 0; ii < maxswap; ii++) {
if (sendlist[ii]) {
for (int jj = 0; jj < nprocmax[ii]; jj++)
memory->destroy(sendlist[ii][jj]);
delete [] sendlist[ii];
}
}
delete [] sendlist; delete [] sendlist;
delete [] maxsendlist;
sendlist = nullptr;
maxsendlist = nullptr;
} else { } else {
memoryKK->destroy_kokkos(k_sendlist,sendlist); memoryKK->destroy_kokkos(k_sendlist,sendlist);
memory->destroy(maxsendlist);
} }
memoryKK->create_kokkos(k_sendlist,sendlist,maxswap,n,BUFMIN,"comm:sendlist"); nprocmaxtot = MAX(nprocmaxtot,n);
memoryKK->create_kokkos(k_sendlist,sendlist,maxswap,nprocmaxtot,BUFMIN,"comm:sendlist");
memory->create(maxsendlist,maxswap,nprocmaxtot,"comm:maxsendlist");
for (int i = 0; i < maxswap; i++)
for (int j = 0; j < nprocmaxtot; j++)
maxsendlist[i][j] = BUFMIN;
} }
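The rewrite replaces the jagged per-swap maxsendlist rows with one rectangular maxswap x nprocmaxtot block, matching the single 3D k_sendlist view, which — unlike the old jagged C arrays — can only be reallocated as a whole rather than row by row. A standalone sketch of the new bookkeeping (sizes are assumptions):

#include <vector>

int main() {
  const int maxswap = 6, nprocmaxtot = 4, BUFMIN = 1000;
  // one rectangular block instead of per-swap arrays of differing length
  std::vector<std::vector<int>> maxsendlist(maxswap, std::vector<int>(nprocmaxtot, BUFMIN));
  // grow_list(): the 3D view is reallocated to the new size, so every
  // (swap, proc) capacity is updated together
  const int size = 2 * BUFMIN;
  for (auto &row : maxsendlist)
    for (auto &cap : row) cap = size;
  return 0;
}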

View File

@ -64,18 +64,17 @@ class CommTiledKokkos : public CommTiled {
template<class DeviceType> void reverse_comm_device(); template<class DeviceType> void reverse_comm_device();
protected: protected:
int nprocmaxtot;
DAT::tdual_int_3d k_sendlist; DAT::tdual_int_3d k_sendlist;
//DAT::tdual_int_scalar k_total_send;
DAT::tdual_xfloat_2d k_buf_send,k_buf_recv; DAT::tdual_xfloat_2d k_buf_send,k_buf_recv;
//DAT::tdual_int_scalar k_count;
void grow_send(int, int) override; void grow_send(int, int) override; // reallocate send buffer
void grow_recv(int, int flag = 0) override; void grow_recv(int, int flag = 0) override; // free/allocate recv buffer
void grow_send_kokkos(int, int, ExecutionSpace space = Host); void grow_send_kokkos(int, int, ExecutionSpace space = Host);
void grow_recv_kokkos(int, int, ExecutionSpace space = Host); void grow_recv_kokkos(int, int, ExecutionSpace space = Host);
void grow_list(int, int, int) override; void grow_list(int, int, int) override; // reallocate sendlist for one swap/proc
void grow_swap_send(int, int, int) override; // grow swap arrays for send and recv void grow_swap_send(int, int, int) override; // grow swap arrays for send and recv
}; };
} // namespace LAMMPS_NS } // namespace LAMMPS_NS

View File

@ -0,0 +1,327 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Drew Rohskopf (SNL)
------------------------------------------------------------------------- */
#include "compute_gaussian_grid_local_kokkos.h"
#include "atom_kokkos.h"
#include "atom_masks.h"
#include "comm.h"
#include "domain.h"
#include "error.h"
#include "force.h"
#include "memory_kokkos.h"
#include "modify.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "neighbor_kokkos.h"
#include "pair.h"
#include "update.h"
#include <cmath>
#include <cstring>
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
template<class DeviceType>
ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) :
ComputeGaussianGridLocal(lmp, narg, arg)
{
kokkosable = 1;
atomKK = (AtomKokkos *) atom;
execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
datamask_read = EMPTY_MASK;
datamask_modify = EMPTY_MASK;
k_cutsq = tdual_fparams("ComputeGaussianGridLocalKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
auto d_cutsq = k_cutsq.template view<DeviceType>();
rnd_cutsq = d_cutsq;
host_flag = (execution_space == Host);
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = 1; j <= atom->ntypes; j++){
k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq[i][j]; //cutsq_tmp;
k_cutsq.template modify<LMPHostType>();
}
}
// Set up element lists
int n = atom->ntypes;
MemKK::realloc_kokkos(d_radelem,"ComputeGaussianGridLocalKokkos::radelem",n);
MemKK::realloc_kokkos(d_sigmaelem,"ComputeGaussianGridLocalKokkos::sigmaelem",n+1);
MemKK::realloc_kokkos(d_prefacelem,"ComputeGaussianGridLocalKokkos::prefacelem",n+1);
MemKK::realloc_kokkos(d_argfacelem,"ComputeGaussianGridLocalKokkos::argfacelem",n+1);
MemKK::realloc_kokkos(d_map,"ComputeGaussianGridLocalKokkos::map",n+1);
auto h_radelem = Kokkos::create_mirror_view(d_radelem);
auto h_sigmaelem = Kokkos::create_mirror_view(d_sigmaelem);
auto h_prefacelem = Kokkos::create_mirror_view(d_prefacelem);
auto h_argfacelem = Kokkos::create_mirror_view(d_argfacelem);
auto h_map = Kokkos::create_mirror_view(d_map);
// start from index 1 because compute sna/grid numbers atom types from 1
for (int i = 1; i <= atom->ntypes; i++) {
h_radelem(i-1) = radelem[i];
h_sigmaelem(i-1) = sigmaelem[i];
h_prefacelem(i-1) = prefacelem[i];
h_argfacelem(i-1) = argfacelem[i];
}
Kokkos::deep_copy(d_radelem,h_radelem);
Kokkos::deep_copy(d_sigmaelem,h_sigmaelem);
Kokkos::deep_copy(d_prefacelem, h_prefacelem);
Kokkos::deep_copy(d_argfacelem, h_argfacelem);
Kokkos::deep_copy(d_map,h_map);
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
ComputeGaussianGridLocalKokkos<DeviceType>::~ComputeGaussianGridLocalKokkos()
{
if (copymode) return;
memoryKK->destroy_kokkos(k_cutsq,cutsq);
memoryKK->destroy_kokkos(k_alocal,alocal);
//gridlocal_allocated = 0;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void ComputeGaussianGridLocalKokkos<DeviceType>::setup()
{
ComputeGridLocal::setup();
// allocate arrays
memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
array_local = alocal;
d_alocal = k_alocal.template view<DeviceType>();
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void ComputeGaussianGridLocalKokkos<DeviceType>::init()
{
ComputeGaussianGridLocal::init();
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
{
if (host_flag) {
return;
}
invoked_local = update->ntimestep;
copymode = 1;
zlen = nzhi-nzlo+1;
ylen = nyhi-nylo+1;
xlen = nxhi-nxlo+1;
total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
x = atomKK->k_x.view<DeviceType>();
type = atomKK->k_type.view<DeviceType>();
k_cutsq.template sync<DeviceType>();
// max_neighs is a fixed guess here; a more elaborate estimate could be used
max_neighs = 100;
// Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
// number of atoms.
ntotal = atomKK->nlocal + atomKK->nghost;
// Allocate view for number of neighbors per grid point
MemKK::realloc_kokkos(d_ninside,"ComputeGaussianGridLocalKokkos:ninside",total_range);
// "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
// `total_range` is the number of grid points which may be larger than chunk size.
// printf(">>> total_range: %d\n", total_range);
chunksize = 32768; // 100*32768
chunk_size = MIN(chunksize, total_range);
chunk_offset = 0;
int vector_length_default = 1;
int team_size_default = 1;
if (!host_flag)
team_size_default = 1; // cost increases with larger team sizes
if (triclinic){
h0 = domain->h[0];
h1 = domain->h[1];
h2 = domain->h[2];
h3 = domain->h[3];
h4 = domain->h[4];
h5 = domain->h[5];
lo0 = domain->boxlo[0];
lo1 = domain->boxlo[1];
lo2 = domain->boxlo[2];
}
while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
if (chunk_size > total_range - chunk_offset)
chunk_size = total_range - chunk_offset;
//Neigh
{
int vector_length = vector_length_default;
int team_size = team_size_default;
check_team_size_for<TagComputeGaussianGridLocalNeigh>(chunk_size,team_size,vector_length);
typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh> policy_neigh(chunk_size,team_size,vector_length);
Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this);
}
// Proceed to the next chunk.
chunk_offset += chunk_size;
} // end while
copymode = 0;
k_alocal.template modify<DeviceType>();
k_alocal.template sync<LMPHostType>();
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianGridLocalNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const
{
const int ii = team.league_rank();
if (ii >= chunk_size) return;
// extract grid index
int igrid = ii + chunk_offset;
// get a pointer to scratch memory
// This is used to cache whether or not an atom is within the cutoff.
// If it is, type_cache is assigned to the atom type.
// If it's not, it's assigned to -1.
const int tile_size = ntotal; // number of elements per thread
const int team_rank = team.team_rank();
const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
// convert to grid indices
int iz = igrid/(xlen*ylen);
int i2 = igrid - (iz*xlen*ylen);
int iy = i2/xlen;
int ix = i2 % xlen;
iz += nzlo;
iy += nylo;
ix += nxlo;
double xgrid[3];
// index ii already captures the proper grid point
//int igrid = iz * (nx * ny) + iy * nx + ix;
// grid2x converts igrid to ix,iy,iz like we've done before
// multiply grid integers by grid spacing delx, dely, delz
//grid2x(igrid, xgrid);
xgrid[0] = ix * delx;
xgrid[1] = iy * dely;
xgrid[2] = iz * delz;
if (triclinic) {
// Do a conversion on `xgrid` here like we do in the CPU version.
// Can't do this:
// domainKK->lamda2x(xgrid, xgrid);
// Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
// Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
xgrid[2] = h2*xgrid[2] + lo2;
}
const F_FLOAT xtmp = xgrid[0];
const F_FLOAT ytmp = xgrid[1];
const F_FLOAT ztmp = xgrid[2];
// Zeroing out the components, which are filled as a sum.
for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
d_alocal(igrid, icol) = 0.0;
}
// Fill grid info columns
d_alocal(igrid, 0) = ix;
d_alocal(igrid, 1) = iy;
d_alocal(igrid, 2) = iz;
d_alocal(igrid, 3) = xtmp;
d_alocal(igrid, 4) = ytmp;
d_alocal(igrid, 5) = ztmp;
// currently, all grid points are type 1
// not clear what a better choice would be
const int itype = 1;
int ielem = 0;
ielem = d_map[itype];
const double radi = d_radelem[ielem];
// Compute the number of neighbors, store rsq
int ninside = 0;
// Looping over ntotal for now.
for (int j = 0; j < ntotal; j++){
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
int jtype = type(j);
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
if (rsq < rnd_cutsq(jtype, jtype) ) {
int icol = size_local_cols_base + jtype - 1;
d_alocal(igrid, icol) += d_prefacelem(jtype-1) * exp(-rsq * d_argfacelem(jtype-1));
}
}
}
/* ----------------------------------------------------------------------
check max team size
------------------------------------------------------------------------- */
template<class DeviceType>
template<class TagStyle>
void ComputeGaussianGridLocalKokkos<DeviceType>::check_team_size_for(int inum, int &team_size, int vector_length) {
int team_size_max;
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
}
namespace LAMMPS_NS {
template class ComputeGaussianGridLocalKokkos<LMPDeviceType>;
#ifdef LMP_KOKKOS_GPU
template class ComputeGaussianGridLocalKokkos<LMPHostType>;
#endif
}
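The functor above recovers (ix, iy, iz) from the flat chunk index with a divide/modulo pair. A standalone round-trip check of that decomposition (grid extents are placeholders):

#include <cstdio>

int main() {
  const int xlen = 4, ylen = 3, zlen = 2;
  for (int igrid = 0; igrid < xlen * ylen * zlen; igrid++) {
    const int iz = igrid / (xlen * ylen);
    const int i2 = igrid - iz * xlen * ylen;
    const int iy = i2 / xlen;
    const int ix = i2 % xlen;
    if (ix + iy * xlen + iz * xlen * ylen != igrid)  // must round-trip exactly
      printf("mismatch at %d\n", igrid);
  }
  return 0;
}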

View File

@ -0,0 +1,96 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef COMPUTE_CLASS
// clang-format off
ComputeStyle(gaussian/grid/local/kk,ComputeGaussianGridLocalKokkos<LMPDeviceType>);
ComputeStyle(gaussian/grid/local/kk/device,ComputeGaussianGridLocalKokkos<LMPDeviceType>);
ComputeStyle(gaussian/grid/local/kk/host,ComputeGaussianGridLocalKokkos<LMPHostType>);
// clang-format on
#else
#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H
#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H
#include "compute_gaussian_grid_local.h"
#include "kokkos_type.h"
namespace LAMMPS_NS {
// clang-format off
struct TagComputeGaussianGridLocalNeigh{};
// clang-format on
template <class DeviceType> class ComputeGaussianGridLocalKokkos : public ComputeGaussianGridLocal {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
// Static team/tile sizes for device offload
#ifdef KOKKOS_ENABLE_HIP
static constexpr int team_size_compute_neigh = 2;
#else
static constexpr int team_size_compute_neigh = 4;
#endif
ComputeGaussianGridLocalKokkos(class LAMMPS *, int, char **);
~ComputeGaussianGridLocalKokkos() override;
void setup() override;
void init() override;
void compute_local() override;
template<class TagStyle>
void check_team_size_for(int, int&, int);
KOKKOS_INLINE_FUNCTION
void operator() (TagComputeGaussianGridLocalNeigh, const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const;
private:
Kokkos::View<double*, DeviceType> d_radelem; // element radii
Kokkos::View<double*, DeviceType> d_sigmaelem;
Kokkos::View<double*, DeviceType> d_prefacelem;
Kokkos::View<double*, DeviceType> d_argfacelem;
Kokkos::View<int*, DeviceType> d_ninside; // ninside for all atoms in list
Kokkos::View<int*, DeviceType> d_map; // mapping from atom types to elements
typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
tdual_fparams k_cutsq;
typedef Kokkos::View<const F_FLOAT**, DeviceType,
Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
t_fparams_rnd rnd_cutsq;
int max_neighs, inum, chunk_size, chunk_offset;
int host_flag;
int total_range; // total number of loop iterations in grid
int xlen, ylen, zlen;
int chunksize;
int ntotal;
typename AT::t_x_array_randomread x;
typename AT::t_int_1d_randomread type;
DAT::tdual_float_2d k_alocal;
typename AT::t_float_2d d_alocal;
// triclinic vars
double h0, h1, h2, h3, h4, h5;
double lo0, lo1, lo2;
};
} // namespace LAMMPS_NS
#endif
#endif
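The cached h0..h5 and lo0..lo2 members inline the Domain::lamda2x transform so it can run in device code; LAMMPS stores the upper-triangular triclinic cell as h = (xx, yy, zz, yz, xz, xy). A standalone sketch with placeholder box values:

#include <cstdio>

int main() {
  const double h[6]  = {10.0, 10.0, 10.0, 1.0, 2.0, 3.0};  // placeholder box
  const double lo[3] = {0.0, 0.0, 0.0};
  const double s[3]  = {0.25, 0.5, 0.75};                  // fractional coords
  const double x = h[0]*s[0] + h[5]*s[1] + h[4]*s[2] + lo[0];
  const double y = h[1]*s[1] + h[3]*s[2] + lo[1];
  const double z = h[2]*s[2] + lo[2];
  printf("%g %g %g\n", x, y, z);
  return 0;
}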

View File

@ -63,6 +63,8 @@ void ComputeReaxFFAtomKokkos<DeviceType>::init()
template<class DeviceType> template<class DeviceType>
void ComputeReaxFFAtomKokkos<DeviceType>::compute_bonds() void ComputeReaxFFAtomKokkos<DeviceType>::compute_bonds()
{ {
invoked_bonds = update->ntimestep;
if (atom->nmax > nmax) { if (atom->nmax > nmax) {
memory->destroy(array_atom); memory->destroy(array_atom);
nmax = atom->nmax; nmax = atom->nmax;

View File

@ -0,0 +1,25 @@
// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "compute_sna_grid_kokkos.h"
#include "compute_sna_grid_kokkos_impl.h"
namespace LAMMPS_NS {
template class ComputeSNAGridKokkosDevice<LMPDeviceType>;
#ifdef LMP_KOKKOS_GPU
template class ComputeSNAGridKokkosHost<LMPHostType>;
#endif
}
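This appears to follow the common explicit-instantiation split: the heavy template bodies live in compute_sna_grid_kokkos_impl.h, included only by this .cpp, and the instantiations above are the only ones other translation units link against. A generic, self-contained sketch of the pattern (names are illustrative, concatenated into one file):

// foo.h: declaration only
template <class T> struct Foo { void run(); };

// foo_impl.h: heavy definition, included by exactly one .cpp
#include <cstdio>
template <class T> void Foo<T>::run() { printf("%zu\n", sizeof(T)); }

// foo.cpp: the single point of instantiation
template struct Foo<int>;

int main() { Foo<int>{}.run(); return 0; }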

View File

@ -0,0 +1,297 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef COMPUTE_CLASS
// clang-format off
ComputeStyle(sna/grid/kk,ComputeSNAGridKokkosDevice<LMPDeviceType>);
ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkosDevice<LMPDeviceType>);
#ifdef LMP_KOKKOS_GPU
ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosHost<LMPHostType>);
#else
ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
#endif
// clang-format on
#else
// clang-format off
#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
#include "compute_sna_grid.h"
#include "kokkos_type.h"
#include "sna_kokkos.h"
namespace LAMMPS_NS {
// Routines for both the CPU and GPU backend
// GPU backend only
struct TagCSNAGridComputeNeigh{};
struct TagCSNAGridComputeCayleyKlein{};
struct TagCSNAGridPreUi{};
struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence
struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
template <bool chemsnap> struct TagCSNAGridComputeZi{};
template <bool chemsnap> struct TagCSNAGridComputeBi{};
struct TagCSNAGridLocalFill{}; // fill the gridlocal array
struct TagComputeSNAGridLoop{};
struct TagComputeSNAGrid3D{};
// CPU backend only
struct TagComputeSNAGridLoopCPU{};
//template<class DeviceType>
template<class DeviceType, typename real_type_, int vector_length_>
class ComputeSNAGridKokkos : public ComputeSNAGrid {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
static constexpr int vector_length = vector_length_;
using real_type = real_type_;
using complex = SNAComplex<real_type>;
// Static team/tile sizes for device offload
#ifdef KOKKOS_ENABLE_HIP
static constexpr int team_size_compute_neigh = 2;
static constexpr int tile_size_compute_ck = 2;
static constexpr int tile_size_pre_ui = 2;
static constexpr int team_size_compute_ui = 2;
static constexpr int tile_size_transform_ui = 2;
static constexpr int tile_size_compute_zi = 2;
static constexpr int min_blocks_compute_zi = 0; // no minimum bound
static constexpr int tile_size_compute_bi = 2;
static constexpr int tile_size_compute_yi = 2;
static constexpr int min_blocks_compute_yi = 0; // no minimum bound
static constexpr int team_size_compute_fused_deidrj = 2;
#else
static constexpr int team_size_compute_neigh = 4;
static constexpr int tile_size_compute_ck = 4;
static constexpr int tile_size_pre_ui = 4;
static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
static constexpr int tile_size_transform_ui = 4;
static constexpr int tile_size_compute_zi = 8;
static constexpr int tile_size_compute_bi = 4;
static constexpr int tile_size_compute_yi = 8;
static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
// this empirically reduces perf fluctuations from compiler version to compiler version
static constexpr int min_blocks_compute_zi = 4;
static constexpr int min_blocks_compute_yi = 4;
#endif
// Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
// This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
// and reduces the verbosity of the LaunchBound by hiding the explicit
// multiplication by vector_length
template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles, min_blocks>, TagComputeSNA>;
// MDRangePolicy for the 3D grid loop:
template <class Device, class TagComputeSNA>
using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
// Testing out team policies
template <class Device, int num_teams, class TagComputeSNA>
using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
//using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::IndexType<int>, Kokkos::IndexType<int>, Kokkos::IndexType<int>, TagComputeSNA>;
//using team_member = typename team_policy::member_type;
// Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
// This hides the LaunchBounds abstraction by hiding the explicit
// multiplication by vector length
template <class Device, int num_teams, class TagComputeSNA>
using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
// Helper routine that returns a CPU or a GPU policy as appropriate
template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
auto snap_get_policy(const int& chunk_size_div, const int& second_loop) {
return Snap3DRangePolicy<Device, num_tiles, TagComputeSNA, min_blocks>({0, 0, 0},
{vector_length, second_loop, chunk_size_div},
{vector_length, num_tiles, 1});
}
ComputeSNAGridKokkos(class LAMMPS *, int, char **);
~ComputeSNAGridKokkos() override;
void setup() override;
void compute_array() override;
// Utility functions for teams
template<class TagStyle>
void check_team_size_for(int, int&);
template<class TagStyle>
void check_team_size_reduce(int, int&);
// operator function for example team policy
//KOKKOS_INLINE_FUNCTION
KOKKOS_INLINE_FUNCTION
void operator() (TagComputeSNAGridLoop, const int& ) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagComputeSNAGridLoopCPU, const int&) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
// 3D case - used by parallel_for
KOKKOS_INLINE_FUNCTION
void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridPreUi, const int& iatom) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiSmall>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiLarge>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridTransformUi, const int& iatom) const;
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom_mod, const int& idxz, const int& iatom_div) const;
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom, const int& idxz) const;
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom) const;
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom_mod, const int& idxb, const int& iatom_div) const;
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom, const int& idxb) const;
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalFill,const int& ii) const;
protected:
SNAKokkos<DeviceType, real_type, vector_length> snaKK;
int max_neighs, chunk_size, chunk_offset;
int host_flag;
int ntotal;
int total_range; // total number of loop iterations in grid
int zlen; //= nzhi-nzlo+1;
int ylen; //= nyhi-nylo+1;
int xlen; //= nxhi-nxlo+1;
double cutsq_tmp; // temporary cutsq until we get a view
Kokkos::View<real_type*, DeviceType> d_radelem; // element radii
Kokkos::View<real_type*, DeviceType> d_wjelem; // elements weights
Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem; // element bispectrum coefficients
Kokkos::View<real_type*, DeviceType> d_sinnerelem; // element inner cutoff midpoint
Kokkos::View<real_type*, DeviceType> d_dinnerelem; // element inner cutoff half-width
Kokkos::View<T_INT*, DeviceType> d_ninside; // ninside for all atoms in list
Kokkos::View<T_INT*, DeviceType> d_map; // mapping from atom types to elements
Kokkos::View<real_type*, DeviceType> d_test; // test view
typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
tdual_fparams k_cutsq;
typedef Kokkos::View<const F_FLOAT**, DeviceType,
Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
t_fparams_rnd rnd_cutsq;
typename AT::t_x_array_randomread x;
typename AT::t_int_1d_randomread type;
DAT::tdual_float_2d k_grid;
DAT::tdual_float_2d k_gridall;
typename AT::t_float_2d d_grid;
typename AT::t_float_2d d_gridall;
DAT::tdual_float_4d k_gridlocal;
typename AT::t_float_4d d_gridlocal;
// Utility routine which wraps computing per-team scratch size requirements for
// ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
template <typename scratch_type>
int scratch_size_helper(int values_per_team);
class DomainKokkos *domainKK;
// triclinic vars
double h0, h1, h2, h3, h4, h5;
double lo0, lo1, lo2;
// Make SNAKokkos a friend
friend class SNAKokkos<DeviceType, real_type, vector_length>;
};
// These wrapper classes exist to make the compute style factory happy/avoid having
// to extend the compute style factory to support Compute classes w/an arbitrary number
// of extra template parameters
template <class DeviceType>
class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
private:
using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
public:
ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
void compute_array() override;
};
#ifdef LMP_KOKKOS_GPU
template <class DeviceType>
class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
private:
using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
public:
ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
void compute_array() override;
};
#endif
}
#endif
#endif
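The Snap3DRangePolicy alias above wraps a Rank-3 MDRangePolicy with Iterate::Left ordering and launch bounds. A minimal standalone sketch of the underlying pattern (sizes are placeholders):

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char **argv) {
  Kokkos::initialize(argc, argv);
  {
    const int vector_length = 32, num_tiles = 4, chunk_div = 8;
    Kokkos::MDRangePolicy<Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>
        policy({0, 0, 0}, {vector_length, num_tiles, chunk_div});
    long total = 0;
    Kokkos::parallel_reduce("count", policy,
        KOKKOS_LAMBDA(int, int, int, long &sum) { sum += 1; }, total);
    printf("iterations: %ld\n", total);  // vector_length * num_tiles * chunk_div
  }
  Kokkos::finalize();
  return 0;
}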

View File

@ -0,0 +1,786 @@
// clang-format off
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
Evan Weinberg (NVIDIA)
------------------------------------------------------------------------- */
#include "compute_sna_grid_kokkos.h"
#include "pair_snap_kokkos.h"
#include "atom_kokkos.h"
#include "atom_masks.h"
#include "comm.h"
#include "error.h"
#include "memory_kokkos.h"
#include "modify.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "neighbor_kokkos.h"
#include "domain.h"
#include "domain_kokkos.h"
#include "sna.h"
#include "update.h"
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iostream>
#define MAXLINE 1024
#define MAXWORD 3
namespace LAMMPS_NS {
// Constructor
template<class DeviceType, typename real_type, int vector_length>
ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
{
kokkosable = 1;
atomKK = (AtomKokkos *) atom;
domainKK = (DomainKokkos *) domain;
execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
datamask_read = EMPTY_MASK;
datamask_modify = EMPTY_MASK;
k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
auto d_cutsq = k_cutsq.template view<DeviceType>();
rnd_cutsq = d_cutsq;
host_flag = (execution_space == Host);
// TODO: Extract cutsq in double loop below, no need for cutsq_tmp
cutsq_tmp = cutsq[1][1];
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = 1; j <= atom->ntypes; j++){
k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
k_cutsq.template modify<LMPHostType>();
}
}
// Set up element lists
MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements);
MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
// test
MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements);
int n = atom->ntypes;
MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
auto h_radelem = Kokkos::create_mirror_view(d_radelem);
auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
auto h_map = Kokkos::create_mirror_view(d_map);
// test
auto h_test = Kokkos::create_mirror_view(d_test);
h_test(0) = 2.0;
// start from index 1 because compute sna/grid numbers atom types from 1
for (int i = 1; i <= atom->ntypes; i++) {
h_radelem(i-1) = radelem[i];
h_wjelem(i-1) = wjelem[i];
if (switchinnerflag){
h_sinnerelem(i) = sinnerelem[i];
h_dinnerelem(i) = dinnerelem[i];
}
}
// In pair snap some things like `map` get allocated regardless of chem flag.
if (chemflag){
for (int i = 1; i <= atom->ntypes; i++) {
h_map(i) = map[i];
}
}
Kokkos::deep_copy(d_radelem,h_radelem);
Kokkos::deep_copy(d_wjelem,h_wjelem);
if (switchinnerflag){
Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
}
if (chemflag){
Kokkos::deep_copy(d_map,h_map);
}
Kokkos::deep_copy(d_test,h_test);
snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
snaKK.grow_rij(0,0);
snaKK.init();
}
// Destructor
template<class DeviceType, typename real_type, int vector_length>
ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
{
if (copymode) return;
memoryKK->destroy_kokkos(k_cutsq,cutsq);
memoryKK->destroy_kokkos(k_gridall, gridall);
}
// Setup
template<class DeviceType, typename real_type, int vector_length>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
{
// Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there.
// Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices.
ComputeGrid::set_grid_global();
ComputeGrid::set_grid_local();
// allocate arrays
memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
// do not use or allocate gridlocal for now
gridlocal_allocated = 0;
array = gridall;
d_gridlocal = k_gridlocal.template view<DeviceType>();
d_gridall = k_gridall.template view<DeviceType>();
}
// Compute
template<class DeviceType, typename real_type, int vector_length>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
{
if (host_flag) {
ComputeSNAGrid::compute_array();
return;
}
copymode = 1;
zlen = nzhi-nzlo+1;
ylen = nyhi-nylo+1;
xlen = nxhi-nxlo+1;
total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
x = atomKK->k_x.view<DeviceType>();
type = atomKK->k_type.view<DeviceType>();
k_cutsq.template sync<DeviceType>();
// max_neighs is a fixed guess here; a more elaborate estimate could be used
max_neighs = 100;
// Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
// number of atoms.
ntotal = atomKK->nlocal + atomKK->nghost;
// Allocate view for number of neighbors per grid point
MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
// "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
// `total_range` is the number of grid points which may be larger than chunk size.
chunk_size = MIN(chunksize, total_range);
chunk_offset = 0;
snaKK.grow_rij(chunk_size, max_neighs);
// Pre-compute ceil(chunk_size / vector_length) for code cleanliness
const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
if (triclinic) {
h0 = domain->h[0];
h1 = domain->h[1];
h2 = domain->h[2];
h3 = domain->h[3];
h4 = domain->h[4];
h5 = domain->h[5];
lo0 = domain->boxlo[0];
lo1 = domain->boxlo[1];
lo2 = domain->boxlo[2];
}
while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
if (chunk_size > total_range - chunk_offset)
chunk_size = total_range - chunk_offset;
//ComputeNeigh
{
int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs);
SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
}
//ComputeCayleyKlein
{
// tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
}
//PreUi
{
auto policy_pre_ui = snap_get_policy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>(chunk_size_div, twojmax + 1);
Kokkos::parallel_for("PreUi", policy_pre_ui, *this);
}
// ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
{
// team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
// scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
const int tile_size = vector_length * (twojmax + 1);
const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
if (chunk_size < parallel_thresh)
{
// Version with parallelism over j_bend
// total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
} else {
// Version w/out parallelism over j_bend
// total number of teams needed: (natoms / 32) * (ntotal)
const int n_teams = chunk_size_div * max_neighs;
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
}
}
//TransformUi: un-"fold" ulisttot, zero ylist
{
// Expand ulisttot_re,_im -> ulisttot
// Zero out ylist
auto policy_transform_ui = snap_get_policy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>(chunk_size_div, snaKK.idxu_max);
Kokkos::parallel_for("TransformUi", policy_transform_ui, *this);
}
//Compute bispectrum
// team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h`
//ComputeZi and Bi
if (nelements > 1) {
auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi<true>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this);
auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi<true>>(chunk_size_div, snaKK.idxb_max);
Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this);
} else {
auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi<false>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);
auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi<false>>(chunk_size_div, snaKK.idxb_max);
Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this);
}
// Fill the grid array with bispectrum values
{
typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
Kokkos::parallel_for(policy_fill, *this);
}
// Proceed to the next chunk.
chunk_offset += chunk_size;
} // end while
copymode = 0;
k_gridlocal.template modify<DeviceType>();
k_gridlocal.template sync<LMPHostType>();
k_gridall.template modify<DeviceType>();
k_gridall.template sync<LMPHostType>();
}
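compute_array() above divides each chunk into vector_length-wide groups with the usual ceil-division idiom, chunk_size_div = (chunk_size + vector_length - 1) / vector_length. A standalone check:

#include <cstdio>

int main() {
  const int vector_length = 32;
  for (int chunk_size : {1, 32, 33, 100}) {
    const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
    printf("%3d -> %d\n", chunk_size, chunk_size_div);  // 1, 1, 2, 4
  }
  return 0;
}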
/* ----------------------------------------------------------------------
Begin routines that are unique to the GPU codepath. These take advantage
of AoSoA data layouts and scratch memory for recursive polynomials
------------------------------------------------------------------------- */
/*
Simple team policy functor seeing how many layers deep we can go with the parallelism.
*/
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
// This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
// Main difference is that we don't use the neighbor class or neighbor variables here.
// This is because the grid points are not atoms and therefore do not get assigned
// neighbors in LAMMPS.
// TODO: If we did make a neighborlist for each grid point, we could use current
// routines and avoid having to loop over all atoms (which limits us to
// natoms = max team size).
// basic quantities associated with this team:
// team_rank : rank of thread in this team
// league_rank : rank of team in this league
// team_size : number of threads in this team
// extract loop index
int ii = team.team_rank() + team.league_rank() * team.team_size();
if (ii >= chunk_size) return;
// extract grid index
int igrid = ii + chunk_offset;
// get a pointer to scratch memory
// This is used to cache whether or not an atom is within the cutoff.
// If it is, type_cache is assigned to the atom type.
// If it's not, it's assigned to -1.
//const int tile_size = ntotal; //max_neighs; // number of elements per thread
//const int team_rank = team.team_rank();
//const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
//int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
// convert to grid indices
int iz = igrid/(xlen*ylen);
int i2 = igrid - (iz*xlen*ylen);
int iy = i2/xlen;
int ix = i2 % xlen;
iz += nzlo;
iy += nylo;
ix += nxlo;
double xgrid[3];
// index ii already captures the proper grid point
//int igrid = iz * (nx * ny) + iy * nx + ix;
// grid2x converts igrid to ix,iy,iz like we've done before
// multiply grid integers by grid spacing delx, dely, delz
//grid2x(igrid, xgrid);
xgrid[0] = ix * delx;
xgrid[1] = iy * dely;
xgrid[2] = iz * delz;
if (triclinic) {
// Do a conversion on `xgrid` here like we do in the CPU version.
// Can't do this:
// domainKK->lamda2x(xgrid, xgrid);
// Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
// Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
xgrid[2] = h2*xgrid[2] + lo2;
}
const F_FLOAT xtmp = xgrid[0];
const F_FLOAT ytmp = xgrid[1];
const F_FLOAT ztmp = xgrid[2];
// currently, all grid points are type 1
// not clear what a better choice would be
const int itype = 1;
int ielem = 0;
if (chemflag) ielem = d_map[itype];
//const double radi = d_radelem[ielem];
// Compute the number of neighbors, store rsq
int ninside = 0;
// Looping over ntotal for now.
for (int j = 0; j < ntotal; j++){
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
int jtype = type(j);
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
// don't include atoms that share location with grid point
if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
jtype = -1; // use -1 to signal it's outside the radius
}
if (jtype >= 0)
ninside++;
}
d_ninside(ii) = ninside;
// TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
int offset = 0;
for (int j = 0; j < ntotal; j++){
//const int jtype = type_cache[j];
//if (jtype >= 0) {
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
int jtype = type(j);
if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
int jelem = 0;
if (chemflag) jelem = d_map[jtype];
snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
// pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
// actually since the views here have values starting at 0, let's use jelem
snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
snaKK.inside(ii,offset) = j;
if (switchinnerflag) {
snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
}
if (chemflag)
snaKK.element(ii,offset) = jelem;
else
snaKK.element(ii,offset) = 0;
offset++;
}
}
}
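// At this point ComputeNeigh has populated d_ninside and the per-(grid
// point, neighbor) entries of rij/wj/rcutij/inside/element that the kernels
// below consume.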
/* ----------------------------------------------------------------------
Pre-compute the Cayley-Klein parameters for reuse in later routines
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
const int ninside = d_ninside(iatom);
if (jnbor >= ninside) return;
snaKK.compute_cayley_klein(iatom, jnbor);
}
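/* The (iatom_mod, iatom_div) pair used throughout is the AoSoA split of the
   flat index: iatom = iatom_mod + iatom_div * vector_length, with iatom_mod
   the lane within a tile. A minimal sketch of the inverse mapping (names
   hypothetical):

     inline void aosoa_split(int iatom, int &mod, int &div) {
       div = iatom / vector_length;  // which SoA tile
       mod = iatom % vector_length;  // lane within the tile
     }
*/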
/* ----------------------------------------------------------------------
Initialize the "ulisttot" structure with non-zero on-diagonal terms
and zero terms elsewhere
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
int itype = type(iatom);
int ielem = d_map[itype];
snaKK.pre_ui(iatom, j, ielem);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const {
if (iatom >= chunk_size) return;
int itype = type(iatom);
int ielem = d_map[itype];
snaKK.pre_ui(iatom, j, ielem);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom) const {
if (iatom >= chunk_size) return;
const int itype = type(iatom);
const int ielem = d_map[itype];
for (int j = 0; j <= twojmax; j++)
snaKK.pre_ui(iatom, j, ielem);
}
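// The three PreUi overloads above serve different launch shapes: an AoSoA
// 3D range over (iatom_mod, j, iatom_div) for the GPU path, a 2D range over
// (iatom, j), and a flat 1D range that loops j internally, likely for the
// CPU path.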
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
const int jbend = jj_jbend / max_neighs;
int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
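// Hypothetical worked example: with max_neighs = 20 and twojmax = 6, each
// iatom_div owns 20*(6+1) = 140 (jj, jbend) pairs, so flattened_idx = 300
// gives iatom_div = 2, jj_jbend = 20, jbend = 1, jj = 0.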
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiLarge>::member_type& team) const {
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
int jj = flattened_idx - iatom_div * max_neighs;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
});
}
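// ComputeUiSmall parallelizes over (jbend, neighbor) pairs for more
// parallelism at the cost of thread divergence; ComputeUiLarge parallelizes
// over neighbors only, trading parallelism for divergence-free execution.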
/* ----------------------------------------------------------------------
De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
structure. Zero-initialize ylist. CPU and GPU.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (idxu >= snaKK.idxu_max) return;
snaKK.transform_ui(iatom, idxu);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const {
if (iatom >= chunk_size) return;
snaKK.transform_ui(iatom, idxu);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom) const {
if (iatom >= chunk_size) return;
for (int idxu = 0; idxu < snaKK.idxu_max; idxu++)
snaKK.transform_ui(iatom, idxu);
}
/* ----------------------------------------------------------------------
Compute all elements of the Z tensor and store them into the `zlist`
view
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom_mod, const int& jjz, const int& iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjz >= snaKK.idxz_max) return;
snaKK.template compute_zi<chemsnap>(iatom, jjz);
}
template<class DeviceType, typename real_type, int vector_length>
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom, const int& jjz) const {
if (iatom >= chunk_size) return;
snaKK.template compute_zi<chemsnap>(iatom, jjz);
}
template<class DeviceType, typename real_type, int vector_length>
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom) const {
if (iatom >= chunk_size) return;
for (int jjz = 0; jjz < snaKK.idxz_max; jjz++)
snaKK.template compute_zi<chemsnap>(iatom, jjz);
}
/* ----------------------------------------------------------------------
Compute the energy triple products and store in the "blist" view
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom_mod, const int& jjb, const int& iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjb >= snaKK.idxb_max) return;
snaKK.template compute_bi<chemsnap>(iatom, jjb);
}
template<class DeviceType, typename real_type, int vector_length>
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom, const int& jjb) const {
if (iatom >= chunk_size) return;
snaKK.template compute_bi<chemsnap>(iatom, jjb);
}
template<class DeviceType, typename real_type, int vector_length>
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom) const {
if (iatom >= chunk_size) return;
for (int jjb = 0; jjb < snaKK.idxb_max; jjb++)
snaKK.template compute_bi<chemsnap>(iatom, jjb);
}
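// The chemsnap template parameter selects the multi-element ("chem SNAP")
// code paths in compute_zi/compute_bi at compile time, so the inner loops
// carry no runtime branch on chemflag.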
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
// extract grid index
int igrid = ii + chunk_offset;
// convert to grid indices
int iz = igrid/(xlen*ylen);
int i2 = igrid - (iz*xlen*ylen);
int iy = i2/xlen;
int ix = i2 % xlen;
iz += nzlo;
iy += nylo;
ix += nxlo;
double xgrid[3];
// index igrid already identifies the proper grid point; as in ComputeNeigh,
// the CPU-side grid2x(igrid, xgrid) is replaced by multiplying the grid
// integers by the grid spacings delx, dely, delz
xgrid[0] = ix * delx;
xgrid[1] = iy * dely;
xgrid[2] = iz * delz;
if (triclinic) {
// Convert `xgrid` from fractional (lamda) to Cartesian coordinates inline,
// as in ComputeNeigh above: domainKK->lamda2x() is a __host__ function and
// cannot be called from this __host__ __device__ operator().
xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
xgrid[2] = h2*xgrid[2] + lo2;
}
const F_FLOAT xtmp = xgrid[0];
const F_FLOAT ytmp = xgrid[1];
const F_FLOAT ztmp = xgrid[2];
d_gridall(igrid,0) = xtmp;
d_gridall(igrid,1) = ytmp;
d_gridall(igrid,2) = ztmp;
const auto idxb_max = snaKK.idxb_max;
// linear contributions
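// Each coefficient index decomposes as icoeff = idx_chem * idxb_max + idxb.
// Hypothetical worked example: idxb_max = 30 and icoeff = 47 give
// idx_chem = 1 and idxb = 17.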
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
d_gridall(igrid,icoeff+3) = snaKK.blist(ii,idx_chem,idxb);
}
}
/* ----------------------------------------------------------------------
utility functions
------------------------------------------------------------------------- */
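// check_team_size_for / check_team_size_reduce query the backend's maximum
// team size for a kernel tag and shrink the requested team size (counted in
// units of vector_length lanes) when it exceeds that maximum.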
template<class DeviceType, typename real_type, int vector_length>
template<class TagStyle>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
int team_size_max;
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
}
template<class DeviceType, typename real_type, int vector_length>
template<class TagStyle>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
int team_size_max;
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
}
template<class DeviceType, typename real_type, int vector_length>
template<typename scratch_type>
int ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
return ScratchViewType::shmem_size(values_per_team);
}
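/* Usage sketch (hypothetical values): requesting per-team scratch for one
   int per (thread, neighbor) pair, as the disabled type_cache path would:

     const int tile_size = max_neighs;
     const int bytes = scratch_size_helper<int>(
       team_size_compute_neigh * vector_length * tile_size);
     policy = policy.set_scratch_size(0, Kokkos::PerTeam(bytes));
*/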
/* ---------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
routines used by template reference classes
------------------------------------------------------------------------- */
template<class DeviceType>
ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
: ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) {}
template<class DeviceType>
void ComputeSNAGridKokkosDevice<DeviceType>::compute_array()
{
Base::compute_array();
}
#ifdef LMP_KOKKOS_GPU
template<class DeviceType>
ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
: ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg) {}
template<class DeviceType>
void ComputeSNAGridKokkosHost<DeviceType>::compute_array()
{
Base::compute_array();
}
#endif
}

View File

@ -0,0 +1,25 @@
// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "compute_sna_grid_local_kokkos.h"
#include "compute_sna_grid_local_kokkos_impl.h"
namespace LAMMPS_NS {
template class ComputeSNAGridLocalKokkosDevice<LMPDeviceType>;
#ifdef LMP_KOKKOS_GPU
template class ComputeSNAGridLocalKokkosHost<LMPHostType>;
#endif
}

View File

@ -0,0 +1,288 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef COMPUTE_CLASS
// clang-format off
ComputeStyle(sna/grid/local/kk,ComputeSNAGridLocalKokkosDevice<LMPDeviceType>);
ComputeStyle(sna/grid/local/kk/device,ComputeSNAGridLocalKokkosDevice<LMPDeviceType>);
#ifdef LMP_KOKKOS_GPU
ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosHost<LMPHostType>);
#else
ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosDevice<LMPHostType>);
#endif
// clang-format on
#else
// clang-format off
#ifndef LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H
#define LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H
#include "compute_sna_grid_local.h"
#include "kokkos_type.h"
#include "sna_kokkos.h"
namespace LAMMPS_NS {
// Kernel dispatch tags
// GPU backend only
struct TagCSNAGridLocalComputeNeigh{};
struct TagCSNAGridLocalComputeCayleyKlein{};
struct TagCSNAGridLocalPreUi{};
struct TagCSNAGridLocalComputeUiSmall{}; // more parallelism, more divergence
struct TagCSNAGridLocalComputeUiLarge{}; // less parallelism, no divergence
struct TagCSNAGridLocalTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
template <bool chemsnap> struct TagCSNAGridLocalComputeZi{};
template <bool chemsnap> struct TagCSNAGridLocalComputeBi{};
struct TagCSNAGridLocal2Fill{}; // fill the gridlocal array
struct TagComputeSNAGridLocalLoop{};
struct TagComputeSNAGridLocal3D{};
// CPU backend only
struct TagComputeSNAGridLocalLoopCPU{};
template<class DeviceType, typename real_type_, int vector_length_>
class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
static constexpr int vector_length = vector_length_;
using real_type = real_type_;
using complex = SNAComplex<real_type>;
// Static team/tile sizes for device offload
#ifdef KOKKOS_ENABLE_HIP
static constexpr int team_size_compute_neigh = 2;
static constexpr int tile_size_compute_ck = 2;
static constexpr int tile_size_pre_ui = 2;
static constexpr int team_size_compute_ui = 2;
static constexpr int tile_size_transform_ui = 2;
static constexpr int tile_size_compute_zi = 2;
static constexpr int min_blocks_compute_zi = 0; // no minimum bound
static constexpr int tile_size_compute_bi = 2;
static constexpr int tile_size_compute_yi = 2;
static constexpr int min_blocks_compute_yi = 0; // no minimum bound
static constexpr int team_size_compute_fused_deidrj = 2;
#else
static constexpr int team_size_compute_neigh = 4;
static constexpr int tile_size_compute_ck = 4;
static constexpr int tile_size_pre_ui = 4;
static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
static constexpr int tile_size_transform_ui = 4;
static constexpr int tile_size_compute_zi = 8;
static constexpr int tile_size_compute_bi = 4;
static constexpr int tile_size_compute_yi = 8;
static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
// this empirically reduces perf fluctuations from compiler version to compiler version
static constexpr int min_blocks_compute_zi = 4;
static constexpr int min_blocks_compute_yi = 4;
#endif
// Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
// This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
// and reduces the verbosity of the LaunchBound by hiding the explicit
// multiplication by vector_length
template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles, min_blocks>, TagComputeSNA>;
// MDRangePolicy for the 3D grid loop:
template <class Device, class TagComputeSNA>
using CSNAGridLocal3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
// Experimental: plain team policy variant for the grid kernels
template <class Device, int num_teams, class TagComputeSNA>
using CSNAGridLocalTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
// Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches.
// It hides the explicit multiplication by vector_length inside the
// LaunchBounds.
template <class Device, int num_teams, class TagComputeSNA>
using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
// Helper routine that returns a CPU or a GPU policy as appropriate
template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
auto snap_get_policy(const int& chunk_size_div, const int& second_loop) {
return Snap3DRangePolicy<Device, num_tiles, TagComputeSNA, min_blocks>({0, 0, 0},
{vector_length, second_loop, chunk_size_div},
{vector_length, num_tiles, 1});
}
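// Usage sketch (hypothetical values): with chunk_size = 1000 and
// vector_length = 32, chunk_size_div = (1000 + 32 - 1)/32 = 32, and
// snap_get_policy<DeviceType, tile_size_compute_ck,
//   TagCSNAGridLocalComputeCayleyKlein>(32, max_neighs)
// launches a 3D range over (lane, neighbor, tile).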
ComputeSNAGridLocalKokkos(class LAMMPS *, int, char **);
~ComputeSNAGridLocalKokkos() override;
void setup() override;
void compute_local() override;
// Utility functions for teams
template<class TagStyle>
void check_team_size_for(int, int&);
template<class TagStyle>
void check_team_size_reduce(int, int&);
KOKKOS_INLINE_FUNCTION
void operator() (TagComputeSNAGridLocalLoop, const int& ) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagComputeSNAGridLocalLoopCPU, const int&) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeNeigh>::member_type& team) const;
// 3D case - used by parallel_for
KOKKOS_INLINE_FUNCTION
void operator()(TagComputeSNAGridLocal3D, const int& iz, const int& iy, const int& ix) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalPreUi, const int& iatom) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiSmall>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiLarge>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalTransformUi, const int& iatom) const;
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom_mod, const int& idxz, const int& iatom_div) const;
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom, const int& idxz) const;
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom) const;
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom_mod, const int& idxb, const int& iatom_div) const;
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom, const int& idxb) const;
template <bool chemsnap> KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocal2Fill,const int& ii) const;
protected:
SNAKokkos<DeviceType, real_type, vector_length> snaKK;
int max_neighs, chunk_size, chunk_offset;
int host_flag;
int ntotal;
int total_range; // total number of loop iterations in grid
int zlen; //= nzhi-nzlo+1;
int ylen; //= nyhi-nylo+1;
int xlen; //= nxhi-nxlo+1;
double cutsq_tmp; // temporary cutsq until we get a view
Kokkos::View<real_type*, DeviceType> d_radelem; // element radii
Kokkos::View<real_type*, DeviceType> d_wjelem; // element weights
Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem; // element bispectrum coefficients
Kokkos::View<real_type*, DeviceType> d_sinnerelem; // element inner cutoff midpoint
Kokkos::View<real_type*, DeviceType> d_dinnerelem; // element inner cutoff half-width
Kokkos::View<T_INT*, DeviceType> d_ninside; // ninside for all atoms in list
Kokkos::View<T_INT*, DeviceType> d_map; // mapping from atom types to elements
Kokkos::View<real_type*, DeviceType> d_test; // test view
typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
tdual_fparams k_cutsq;
typedef Kokkos::View<const F_FLOAT**, DeviceType,
Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
t_fparams_rnd rnd_cutsq;
typename AT::t_x_array_randomread x;
typename AT::t_int_1d_randomread type;
DAT::tdual_float_2d k_alocal;
typename AT::t_float_2d d_alocal;
// Utility routine which wraps computing per-team scratch size requirements for
// ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
template <typename scratch_type>
int scratch_size_helper(int values_per_team);
class DomainKokkos *domainKK;
// triclinic vars
double h0, h1, h2, h3, h4, h5;
double lo0, lo1, lo2;
// Make SNAKokkos a friend
friend class SNAKokkos<DeviceType, real_type, vector_length>;
};
// These wrapper classes keep the compute style factory simple: they avoid
// extending it to support Compute classes with an arbitrary number of
// extra template parameters.
template <class DeviceType>
class ComputeSNAGridLocalKokkosDevice : public ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
private:
using Base = ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
public:
ComputeSNAGridLocalKokkosDevice(class LAMMPS *, int, char **);
void compute_local() override;
};
#ifdef LMP_KOKKOS_GPU
template <class DeviceType>
class ComputeSNAGridLocalKokkosHost : public ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
private:
using Base = ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
public:
ComputeSNAGridLocalKokkosHost(class LAMMPS *, int, char **);
void compute_local() override;
};
#endif
}
#endif
#endif

Some files were not shown because too many files have changed in this diff