Merge remote-tracking branch 'github/develop' into add-error-explanations

2025-01-31 03:23:51 -05:00
parent 00054a8d97 ea1607f1d8
commit 42e379a8de
236 changed files with 15318 additions and 8091 deletions
--- a/.github/workflows/style-check.yml
+++ b/.github/workflows/style-check.yml
@ -35,3 +35,4 @@ jobs:
         make check-permissions
         make check-homepage
         make check-errordocs
+         make check-fmtlib
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -98,23 +98,26 @@ check_for_autogen_files(${LAMMPS_SOURCE_DIR})
 #####################################################################
 include(CheckIncludeFileCXX)

-# set required compiler flags, apply checks, and compiler/CPU arch specific optimizations
+# set required compiler flags and compiler/CPU arch specific optimizations
 if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-  # Intel classic compilers version 19 are broken and fail to compile the embedded fmtlib
-  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 20.0)
-    message(ERROR "Intel classic compiler version ${CMAKE_CXX_COMPILER_VERSION} is too old")
-  endif()
-
  if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
    if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qrestrict")
    endif()
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.3 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.4)
+      set(CMAKE_TUNE_DEFAULT "/QxCOMMON-AVX512")
+    else()
      set(CMAKE_TUNE_DEFAULT "/QxHost")
+    endif()
  else()
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict")
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.3 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.4)
+      set(CMAKE_TUNE_DEFAULT "-xCOMMON-AVX512")
+    else()
      set(CMAKE_TUNE_DEFAULT "-xHost -fp-model fast=2 -no-prec-div -qoverride-limits -diag-disable=10441 -diag-disable=11074 -diag-disable=11076 -diag-disable=2196")
    endif()
  endif()
+endif()

 # silence excessive warnings for new Intel Compilers
 if(CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
--- a/cmake/Modules/Packages/PLUMED.cmake
+++ b/cmake/Modules/Packages/PLUMED.cmake
@ -32,9 +32,9 @@ endif()

 # Note: must also adjust check for supported API versions in
 # fix_plumed.cpp when version changes from v2.n.x to v2.n+1.y
-set(PLUMED_URL "https://github.com/plumed/plumed2/releases/download/v2.9.2/plumed-src-2.9.2.tgz"
+set(PLUMED_URL "https://github.com/plumed/plumed2/releases/download/v2.9.3/plumed-src-2.9.3.tgz"
  CACHE STRING "URL for PLUMED tarball")
-set(PLUMED_MD5 "04862602a372c1013bdfee2d6d03bace" CACHE STRING "MD5 checksum of PLUMED tarball")
+set(PLUMED_MD5 "ee1249805fe94bccee17d10610d3f6f1" CACHE STRING "MD5 checksum of PLUMED tarball")

 mark_as_advanced(PLUMED_URL)
 mark_as_advanced(PLUMED_MD5)
--- a/doc/src/Build_basics.rst
+++ b/doc/src/Build_basics.rst
@ -196,13 +196,18 @@ LAMMPS.

   .. tab:: CMake build

-      By default CMake will use the compiler it finds according to
+      By default CMake will use the compiler it finds according to its
      internal preferences, and it will add optimization flags
      appropriate to that compiler and any :doc:`accelerator packages
      <Speed_packages>` you have included in the build.  CMake will
      check if the detected or selected compiler is compatible with the
      C++ support requirements of LAMMPS and stop with an error, if this
-      is not the case.
+      is not the case.  A C++11 compatible compiler is currently
+      required, but a transition to require C++17 is in progress and
+      planned to be completed in Summer 2025. Currently, setting
+      ``-DLAMMPS_CXX11=yes`` is required when configuring with CMake while
+      using a C++11 compatible compiler that does not support C++17,
+      otherwise setting ``-DCMAKE_CXX_STANDARD=17`` is preferred.

      You can tell CMake to look for a specific compiler with setting
      CMake variables (listed below) during configuration.  For a few
@ -223,6 +228,8 @@ LAMMPS.
         -D CMAKE_C_COMPILER=name              # name of C compiler
         -D CMAKE_Fortran_COMPILER=name        # name of Fortran compiler

+         -D CMAKE_CXX_STANDARD=17              # put compiler in C++17 mode
+         -D LAMMPS_CXX11=yes                   # enforce compilation in C++11 mode
         -D CMAKE_CXX_FLAGS=string             # flags to use with C++ compiler
         -D CMAKE_C_FLAGS=string               # flags to use with C compiler
         -D CMAKE_Fortran_FLAGS=string         # flags to use with Fortran compiler
@ -321,6 +328,14 @@ LAMMPS.
         you would have to install a newer compiler that supports C++11;
         either as a binary package or through compiling from source.

+      While a C++11 compatible compiler is currently sufficient to compile
+      LAMMPS, a transition to require C++17 is in progress and planned to
+      be completed in Summer 2025. Currently, setting ``-DLAMMPS_CXX11``
+      in the ``LMP_INC =`` line in the machine makefile is required when
+      using a C++11 compatible compiler that does not support C++17.
+      Otherwise, to enable C++17 support (if not enabled by default) using
+      a compiler flag like ``-std=c++17`` in CCFLAGS may be needed.
+
      If you build LAMMPS with any :doc:`Speed_packages` included,
      there may be specific compiler or linker flags that are either
      required or recommended to enable required features and to
--- a/doc/src/Commands_compute.rst
+++ b/doc/src/Commands_compute.rst
@ -58,6 +58,7 @@ KOKKOS, o = OPENMP, t = OPT.
   * :doc:`fep/ta <compute_fep_ta>`
   * :doc:`force/tally <compute_tally>`
   * :doc:`fragment/atom <compute_cluster_atom>`
+   * :doc:`gaussian/grid/local (k) <compute_gaussian_grid_local>`
   * :doc:`global/atom <compute_global_atom>`
   * :doc:`group/group <compute_group_group>`
   * :doc:`gyration <compute_gyration>`
@ -140,8 +141,8 @@ KOKKOS, o = OPENMP, t = OPT.
   * :doc:`smd/vol <compute_smd_vol>`
   * :doc:`snap <compute_sna_atom>`
   * :doc:`sna/atom <compute_sna_atom>`
-   * :doc:`sna/grid <compute_sna_atom>`
-   * :doc:`sna/grid/local <compute_sna_atom>`
+   * :doc:`sna/grid (k) <compute_sna_atom>`
+   * :doc:`sna/grid/local (k) <compute_sna_atom>`
   * :doc:`snad/atom <compute_sna_atom>`
   * :doc:`snav/atom <compute_sna_atom>`
   * :doc:`sph/e/atom <compute_sph_e_atom>`
--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@ -115,7 +115,9 @@ OPT.
   * :doc:`gw/zbl <pair_gw>`
   * :doc:`harmonic/cut (o) <pair_harmonic_cut>`
   * :doc:`hbond/dreiding/lj (o) <pair_hbond_dreiding>`
+   * :doc:`hbond/dreiding/lj/angleoffset (o) <pair_hbond_dreiding>`
   * :doc:`hbond/dreiding/morse (o) <pair_hbond_dreiding>`
+   * :doc:`hbond/dreiding/morse/angleoffset (o) <pair_hbond_dreiding>`
   * :doc:`hdnnp <pair_hdnnp>`
   * :doc:`hippo (g) <pair_amoeba>`
   * :doc:`ilp/graphene/hbn (t) <pair_ilp_graphene_hbn>`
--- a/doc/src/Developer_code_design.rst
+++ b/doc/src/Developer_code_design.rst
@ -203,6 +203,7 @@ processed in the expected order before types are removed from dynamic
 dispatch.

 .. admonition:: Important Notes
+   :class: note

   In order to be able to detect incompatibilities at compile time and
   to avoid unexpected behavior, it is crucial that all member functions
@ -300,18 +301,24 @@ Formatting with the {fmt} library

 The LAMMPS source code includes a copy of the `{fmt} library
 <https://fmt.dev>`_, which is preferred over formatting with the
-"printf()" family of functions.  The primary reason is that it allows
-a typesafe default format for any type of supported data.  This is
+"printf()" family of functions.  The primary reason is that it allows a
+typesafe default format for any type of supported data.  This is
 particularly useful for formatting integers of a given size (32-bit or
-64-bit) which may require different format strings depending on
-compile time settings or compilers/operating systems.  Furthermore,
-{fmt} gives better performance, has more functionality, a familiar
-formatting syntax that has similarities to ``format()`` in Python, and
-provides a facility that can be used to integrate format strings and a
-variable number of arguments into custom functions in a much simpler
-way than the varargs mechanism of the C library.  Finally, {fmt} has
-been included into the C++20 language standard, so changes to adopt it
-are future-proof.
+64-bit) which may require different format strings depending on compile
+time settings or compilers/operating systems.  Furthermore, {fmt} gives
+better performance, has more functionality, a familiar formatting syntax
+that has similarities to ``format()`` in Python, and provides a facility
+that can be used to integrate format strings and a variable number of
+arguments into custom functions in a much simpler way than the varargs
+mechanism of the C library.  Finally, {fmt} has been included into the
+C++20 language standard as ``std::format()``, so changes to adopt it are
+future-proof, for as long as they are not using any extensions that are
+not (yet) included into C++.
+
+The long-term plan is to switch to using ``std::format()`` instead of
+``fmt::format()`` once the minimum C++ standard required for LAMMPS is
+set to C++20. See the :ref:`basic build instructions <compile>` for
+more details.

 Formatted strings are frequently created by calling the
 ``fmt::format()`` function, which will return a string as a
@ -319,11 +326,13 @@ Formatted strings are frequently created by calling the
 ``printf()``, the {fmt} library uses ``{}`` to embed format descriptors.
 In the simplest case, no additional characters are needed, as {fmt} will
 choose the default format based on the data type of the argument.
-Otherwise, the ``fmt::print()`` function may be used instead of
-``printf()`` or ``fprintf()``.  In addition, several LAMMPS output
-functions, that originally accepted a single string as argument have
-been overloaded to accept a format string with optional arguments as
-well (e.g., ``Error::all()``, ``Error::one()``, ``utils::logmesg()``).
+Otherwise, the :cpp:func:`utils::print() <LAMMPS_NS::utils::print>`
+function may be used instead of ``printf()`` or ``fprintf()``.  In
+addition, several LAMMPS output functions, that originally accepted a
+single string as argument have been overloaded to accept a format string
+with optional arguments as well (e.g., ``Error::all()``,
+``Error::one()``, :cpp:func:`utils::logmesg()
+<LAMMPS_NS::utils::logmesg>`).

 Summary of the {fmt} format syntax
 ==================================
--- a/doc/src/Developer_flow.rst
+++ b/doc/src/Developer_flow.rst
@ -209,7 +209,7 @@ nve, nvt, npt.

 At the end of the timestep, fixes that contain an ``end_of_step()``
 method are invoked.  These typically perform a diagnostic calculation,
-e.g. the ave/time and ave/spatial fixes.  The final operation of the
+e.g. the ave/time and ave/chunk fixes.  The final operation of the
 timestep is to perform any requested output, via the ``write()`` method
 of the Output class.  There are 3 kinds of LAMMPS output: thermodynamic
 output to the screen and log file, snapshots of atom data to a dump
--- a/doc/src/Developer_utils.rst
+++ b/doc/src/Developer_utils.rst
@ -238,6 +238,12 @@ Convenience functions
 .. doxygenfunction:: logmesg(LAMMPS *lmp, const std::string &mesg)
   :project: progguide

+.. doxygenfunction:: print(FILE *fp, const std::string &format, Args&&... args)
+   :project: progguide
+
+.. doxygenfunction:: print(FILE *fp, const std::string &mesg)
+   :project: progguide
+
 .. doxygenfunction:: errorurl
   :project: progguide

--- a/doc/src/Developer_write_fix.rst
+++ b/doc/src/Developer_write_fix.rst
@ -96,7 +96,7 @@ Here the we specify which methods of the fix should be called during
     MPI_Allreduce(localAvgVel, globalAvgVel, 4, MPI_DOUBLE, MPI_SUM, world);
     scale3(1.0 / globalAvgVel[3], globalAvgVel);
     if ((comm->me == 0) && screen) {
-       fmt::print(screen,"{}, {}, {}\n",
+       utils::print(screen, "{}, {}, {}\n",
                    globalAvgVel[0], globalAvgVel[1], globalAvgVel[2]);
     }
   }
--- a/doc/src/Fortran.rst
+++ b/doc/src/Fortran.rst
@ -323,6 +323,12 @@ of the contents of the :f:mod:`LIBLAMMPS` Fortran interface to LAMMPS.
   :ftype set_internal_variable: subroutine
   :f eval: :f:func:`eval`
   :ftype eval: function
+   :f clearstep_compute: :f:subr:`clearstep_compute`
+   :ftype clearstep_compute: subroutine
+   :f addstep_compute: :f:subr:`addstep_compute`
+   :ftype addstep_compute: subroutine
+   :f addstep_compute_all: :f:subr:`addstep_compute_all`
+   :ftype addstep_compute_all: subroutine
   :f gather_atoms: :f:subr:`gather_atoms`
   :ftype gather_atoms: subroutine
   :f gather_atoms_concat: :f:subr:`gather_atoms_concat`
@ -956,6 +962,7 @@ Procedures Bound to the :f:type:`lammps` Derived Type
      :f:func:`extract_atom` between runs.

   .. admonition:: Array index order
+      :class: tip

      Two-dimensional arrays returned from :f:func:`extract_atom` will be
      **transposed** from equivalent arrays in C, and they will be indexed
@ -1068,6 +1075,7 @@ Procedures Bound to the :f:type:`lammps` Derived Type
   you based on data from the :cpp:class:`Compute` class.

   .. admonition:: Array index order
+      :class: tip

      Two-dimensional arrays returned from :f:func:`extract_compute` will be
      **transposed** from equivalent arrays in C, and they will be indexed
@ -1326,6 +1334,7 @@ Procedures Bound to the :f:type:`lammps` Derived Type
   :rtype data: polymorphic

   .. admonition:: Array index order
+      :class: tip

      Two-dimensional global, per-atom, or local array data from
      :f:func:`extract_fix` will be **transposed** from equivalent arrays in
@ -1450,11 +1459,62 @@ Procedures Bound to the :f:type:`lammps` Derived Type
   an internal-style variable, an error is generated.

   :p character(len=*) name: name of the variable
-   :p read(c_double) val:  new value to assign to the variable
+   :p real(c_double) val:  new value to assign to the variable
   :to: :cpp:func:`lammps_set_internal_variable`

 --------

+.. f:function:: eval(expr)
+
+   This function is a wrapper around :cpp:func:`lammps_eval` that takes a
+   LAMMPS equal style variable string, evaluates it and returns the resulting
+   scalar value as a floating-point number.
+
+   .. versionadded:: TBD
+
+   :p character(len=\*) expr: string to be evaluated
+   :to: :cpp:func:`lammps_eval`
+   :r value [real(c_double)]: result of the evaluated string
+
+--------
+
+.. f:subroutine:: clearstep_compute()
+
+   Clear whether a compute has been invoked
+
+   .. versionadded:: TBD
+
+   :to: :cpp:func:`lammps_clearstep_compute`
+
+--------
+
+.. f:subroutine:: addstep_compute(nextstep)
+
+   Add timestep to list of future compute invocations
+   if the compute has been invoked on the current timestep
+
+   .. versionadded:: TBD
+
+   overloaded for 32-bit and 64-bit integer arguments
+
+   :p integer(kind=8 or kind=4) nextstep: next timestep
+   :to: :cpp:func:`lammps_addstep_compute`
+
+--------
+
+.. f:subroutine:: addstep_compute_all(nextstep)
+
+   Add timestep to list of future compute invocations
+
+   .. versionadded:: TBD
+
+   overloaded for 32-bit and 64-bit integer arguments
+
+   :p integer(kind=8 or kind=4) nextstep: next timestep
+   :to: :cpp:func:`lammps_addstep_compute_all`
+
+--------
+
 .. f:subroutine:: gather_atoms(name, count, data)

   This function calls :cpp:func:`lammps_gather_atoms` to gather the named
--- a/doc/src/Howto_barostat.rst
+++ b/doc/src/Howto_barostat.rst
@ -10,20 +10,21 @@ and/or pressure (P) is specified by the user, and the thermostat or
 barostat attempts to equilibrate the system to the requested T and/or
 P.

-Barostatting in LAMMPS is performed by :doc:`fixes <fix>`.  Two
+Barostatting in LAMMPS is performed by :doc:`fixes <fix>`.  Three
 barostatting methods are currently available: Nose-Hoover (npt and
-nph) and Berendsen:
+nph), Berendsen, and various linear controllers in deform/pressure:

 * :doc:`fix npt <fix_nh>`
 * :doc:`fix npt/sphere <fix_npt_sphere>`
 * :doc:`fix npt/asphere <fix_npt_asphere>`
 * :doc:`fix nph <fix_nh>`
 * :doc:`fix press/berendsen <fix_press_berendsen>`
+* :doc:`fix deform/pressure <fix_deform_pressure>`

 The :doc:`fix npt <fix_nh>` commands include a Nose-Hoover thermostat
 and barostat.  :doc:`Fix nph <fix_nh>` is just a Nose/Hoover barostat;
-it does no thermostatting.  Both :doc:`fix nph <fix_nh>` and :doc:`fix press/berendsen <fix_press_berendsen>` can be used in conjunction
-with any of the thermostatting fixes.
+it does no thermostatting.  The fixes :doc:`nph <fix_nh>`, :doc:`press/berendsen <fix_press_berendsen>`, and :doc:`deform/pressure <fix_deform_pressure>`
+can be used in conjunction with any of the thermostatting fixes.

 As with the :doc:`thermostats <Howto_thermostat>`, :doc:`fix npt <fix_nh>`
 and :doc:`fix nph <fix_nh>` only use translational motion of the
@ -44,9 +45,9 @@ a temperature or pressure compute to a barostatting fix.
 .. note::

   As with the thermostats, the Nose/Hoover methods (:doc:`fix npt <fix_nh>` and :doc:`fix nph <fix_nh>`) perform time integration.
-   :doc:`Fix press/berendsen <fix_press_berendsen>` does NOT, so it should
-   be used with one of the constant NVE fixes or with one of the NVT
-   fixes.
+   :doc:`Fix press/berendsen <fix_press_berendsen>` and :doc:`fix deform/pressure <fix_deform_pressure>`
+   do NOT, so they should be used with one of the constant NVE fixes or with
+   one of the NVT fixes.

 Thermodynamic output, which can be setup via the
 :doc:`thermo_style <thermo_style>` command, often includes pressure
--- a/doc/src/Install_git.rst
+++ b/doc/src/Install_git.rst
@ -52,6 +52,7 @@ your machine and "release" is one of the 3 branches listed above.
 between them at any time using "git checkout <branch name>".)

 .. admonition:: Saving time and disk space when using ``git clone``
+   :class: note

   The complete git history of the LAMMPS project is quite large because
   it contains the entire commit history of the project since fall 2006,
--- a/doc/src/Library_objects.rst
+++ b/doc/src/Library_objects.rst
@ -13,6 +13,9 @@ fixes, or variables in LAMMPS using the following functions:
 - :cpp:func:`lammps_set_internal_variable`
 - :cpp:func:`lammps_variable_info`
 - :cpp:func:`lammps_eval`
+- :cpp:func:`lammps_clearstep_compute`
+- :cpp:func:`lammps_addstep_compute_all`
+- :cpp:func:`lammps_addstep_compute`

 -----------------------

@ -61,6 +64,21 @@ fixes, or variables in LAMMPS using the following functions:

 -----------------------

+.. doxygenfunction:: lammps_clearstep_compute
+   :project: progguide
+
+-----------------------
+
+.. doxygenfunction:: lammps_addstep_compute_all
+   :project: progguide
+
+-----------------------
+
+.. doxygenfunction:: lammps_addstep_compute
+   :project: progguide
+
+-----------------------
+
 .. doxygenenum:: _LMP_DATATYPE_CONST

 .. doxygenenum:: _LMP_STYLE_CONST
--- a/doc/src/Modify_compute.rst
+++ b/doc/src/Modify_compute.rst
@ -45,6 +45,8 @@ class.  See compute.h for details.
 +-----------------------+------------------------------------------------------------------+
 | pair_tally_callback   | callback function for *tally*\ -style computes (optional).       |
 +-----------------------+------------------------------------------------------------------+
+| modify_param          | called when a compute_modify request is executed (optional)      |
++-----------------------+------------------------------------------------------------------+
 | memory_usage          | tally memory usage (optional)                                    |
 +-----------------------+------------------------------------------------------------------+

--- a/doc/src/compute.rst
+++ b/doc/src/compute.rst
@ -236,6 +236,7 @@ The individual style names on the :doc:`Commands compute <Commands_compute>` pag
 * :doc:`fep/ta <compute_fep_ta>` - compute free energies for a test area perturbation
 * :doc:`force/tally <compute_tally>` - force between two groups of atoms via the tally callback mechanism
 * :doc:`fragment/atom <compute_cluster_atom>` - fragment ID for each atom
+* :doc:`gaussian/grid/local <compute_gaussian_grid_local>` - local array of Gaussian atomic contributions on a regular grid
 * :doc:`global/atom <compute_global_atom>` - assign global values to each atom from arrays of global values
 * :doc:`group/group <compute_group_group>` - energy/force between two groups of atoms
 * :doc:`gyration <compute_gyration>` - radius of gyration of group of atoms
--- a/doc/src/compute_gaussian_grid_local.rst
+++ b/doc/src/compute_gaussian_grid_local.rst
@ -0,0 +1,99 @@
+.. index:: compute gaussian/grid/local
+.. index:: compute gaussian/grid/local/kk
+
+compute gaussian/grid/local command
+===================================
+
+Accelerator Variants: *gaussian/grid/local/kk*
+
+Syntax
+""""""
+
+.. code-block:: LAMMPS
+
+   compute ID group-ID gaussian/grid/local grid nx ny nz rcutfac  R_1 R_2 ... sigma_1 sigma_2
+
+* ID, group-ID are documented in :doc:`compute <compute>` command
+* gaussian/grid/local = style name of this compute command
+* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
+* *rcutfac* = scale factor applied to all cutoff radii (positive real)
+* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
+* *sigma_1, sigma_2,...* = Gaussian widths, one for each type (distance units)
+
+Examples
+""""""""
+
+.. code-block:: LAMMPS
+
+    compute mygrid all gaussian/grid/local grid 40 40 40 4.0 0.5 0.5 0.4 0.4
+
+Description
+"""""""""""
+
+.. versionadded:: TBD
+
+Define a computation that calculates a Gaussian representation of the ionic
+structure. This representation is used for the efficient evaluation
+of quantities related to the structure factor in a grid-based workflow,
+such as the ML-DFT workflow MALA :ref:`(Ellis) <Ellis2021b>`, for which it was originally
+implemented. Usage of the workflow is described in a separate publication :ref:`(Fiedler) <Fiedler2023>`.
+
+For each LAMMPS type, a separate sum of Gaussians is calculated, using
+a separate Gaussian broadening per type. The computation
+is always performed on the numerical grid, no atom-based version of this
+compute exists. The Gaussian representation can only be executed in a local
+fashion, thus the output array only contains rows for grid points
+that are local to the processor subdomain. The layout of the grid is the same
+as for the :doc:`sna/grid/local <compute_sna_atom>` command.
+
+Namely, the array contains one row for each of the
+local grid points, looping over the global index *ix* fastest,
+then *iy*, and *iz* slowest.  Each row of the array contains
+the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*,
+and *z* coordinates of the grid point, followed by the values of the Gaussians
+(one floating point number per type per grid point).
+
+----------
+
+
+.. include:: accel_styles.rst
+
+
+
+----------
+
+Output info
+"""""""""""
+
+Compute *gaussian/grid/local* evaluates a local array.
+The array contains one row for each of the
+local grid points, looping over the global index *ix* fastest,
+then *iy*, and *iz* slowest.  The array contains :math:`ntypes+6` columns,
+where *ntypes* is the number of LAMMPS types. The first three columns are
+the global indexes *ix*, *iy*, and *iz*, followed by the *x*, *y*,
+and *z* coordinates of the grid point, followed by the *ntypes* columns
+containing the values of the Gaussians for each type.
+
+Restrictions
+""""""""""""
+
+This compute is part of the ML-SNAP package.  It is only enabled
+if LAMMPS was built with that package.  See the :doc:`Build package
+<Build_package>` page for more info.
+
+Related commands
+""""""""""""""""
+
+:doc:`compute sna/grid/local <compute_sna_atom>`
+
+----------
+
+.. _Ellis2021b:
+
+**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_
+
+.. _Fiedler2023:
+
+**(Fiedler)** Fiedler, Modine, Schmerler, Vogel, Popoola, Thompson, Rajamanickam, and Cangi,
+`npj Comp. Mater., 9, 115 (2023) <https://doi.org/10.1038/s41524-023-01070-z>`_
+
--- a/doc/src/compute_sna_atom.rst
+++ b/doc/src/compute_sna_atom.rst
@ -3,7 +3,9 @@
 .. index:: compute snav/atom
 .. index:: compute snap
 .. index:: compute sna/grid
+.. index:: compute sna/grid/kk
 .. index:: compute sna/grid/local
+.. index:: compute sna/grid/local/kk

 compute sna/atom command
 ========================
@ -20,9 +22,14 @@ compute snap command
 compute sna/grid command
 ========================

+compute sna/grid/kk command
+===========================
+
 compute sna/grid/local command
 ==============================

+Accelerator Variants: *sna/grid/local/kk*
+
 Syntax
 """"""

@ -33,17 +40,17 @@ Syntax
   compute ID group-ID snav/atom rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
   compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
   compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
-   compute ID group-ID sna/grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
-   compute ID group-ID sna/grid/local nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
+   compute ID group-ID sna/grid grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
+   compute ID group-ID sna/grid/local grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...

 * ID, group-ID are documented in :doc:`compute <compute>` command
 * sna/atom = style name of this compute command
-* rcutfac = scale factor applied to all cutoff radii (positive real)
-* rfac0 = parameter in distance to angle conversion (0 < rcutfac < 1)
-* twojmax = band limit for bispectrum components (non-negative integer)
-* R_1, R_2,... = list of cutoff radii, one for each type (distance units)
-* w_1, w_2,... = list of neighbor weights, one for each type
-* nx, ny, nz = number of grid points in x, y, and z directions (positive integer)
+* *rcutfac* = scale factor applied to all cutoff radii (positive real)
+* *rfac0* = parameter in distance to angle conversion (0 < rfac0 < 1)
+* *twojmax* = band limit for bispectrum components (non-negative integer)
+* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
+* *w_1, w_2,...* = list of neighbor weights, one for each type
+* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
 * zero or more keyword/value pairs may be appended
 * keyword = *rmin0* or *switchflag* or *bzeroflag* or *quadraticflag* or *chem* or *bnormflag* or *wselfallflag* or *bikflag* or *switchinnerflag* or *sinner* or *dinner* or *dgradflag* or *nnn* or *wmode* or *delta*

@ -103,7 +110,7 @@ Examples
   compute snap all snap 1.4 0.95 6 2.0 1.0
   compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 chem 2 0 1
   compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 switchinnerflag 1 sinner 1.35 1.6 dinner 0.25 0.3
-   compute bgrid all sna/grid/local 200 200 200 1.4 0.95 6 2.0 1.0
+   compute bgrid all sna/grid/local grid 200 200 200 1.4 0.95 6 2.0 1.0
   compute bnnn all sna/atom 9.0 0.99363 8 0.5 1.0 rmin0 0.0 nnn 24 wmode 1 delta 0.2

 Description
@ -252,7 +259,8 @@ for finite-temperature Kohn-Sham density functional theory (:ref:`Ellis
 et al. <Ellis2021>`) Neighbor atoms not in the group do not contribute
 to the bispectrum components of the grid points. The distance cutoff
 :math:`R_{ii'}` assumes that *i* has the same type as the neighbor atom
-*i'*.
+*i'*. Both computes can be hardware accelerated with Kokkos by using the
+*sna/grid/kk* and *sna/grid/local/kk* commands, respectively.

 Compute *sna/grid* calculates a global array containing bispectrum
 components for a regular grid of points.
@ -463,6 +471,12 @@ fluctuations in the resulting local atomic environment fingerprint.  The
 detailed formalism is given in the paper by Lafourcade et
 al. :ref:`(Lafourcade) <Lafourcade2023_2>`.

+----------
+
+
+.. include:: accel_styles.rst
+
+
 ----------

 Output info
@ -654,7 +668,7 @@ of Angular Momentum, World Scientific, Singapore (1987).

 .. _Ellis2021:

-**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam,  Phys Rev B, 104, 035120, (2021)
+**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_

 .. _Lafourcade2023_2:

--- a/doc/src/fix_python_invoke.rst
+++ b/doc/src/fix_python_invoke.rst
@ -66,6 +66,15 @@ gives access to the LAMMPS state from Python.
   from these callbacks, trying to execute input script commands will in the best
   case not work or in the worst case result in undefined behavior.

+Restart, fix_modify, output, run start/stop, minimize info
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+No information about this fix is written to :doc:`binary restart files <restart>`.  None of the :doc:`fix_modify <fix_modify>` options
+are relevant to this fix.  No global or per-atom quantities are stored
+by this fix for access by various :doc:`output commands <Howto_output>`.
+No parameter of this fix can be used with the *start/stop* keywords of
+the :doc:`run <run>` command.  This fix is not invoked during :doc:`energy minimization <minimize>`.
+
 Restrictions
 """"""""""""

--- a/doc/src/fix_reaxff_species.rst
+++ b/doc/src/fix_reaxff_species.rst
@ -200,8 +200,8 @@ The 2 values in the global vector are as follows:
 The per-atom vector stores the molecule ID for each atom as identified
 by the fix.  If an atom is not in a molecule, its ID will be 0.
 For atoms in the same molecule, the molecule ID for all of them
-will be the same and will be equal to the smallest atom ID of
-any atom in the molecule.
+will be the same, and molecule IDs will range from 1 to the number
+of molecules.

 No parameter of this fix can be used with the *start/stop* keywords of
 the :doc:`run <run>` command.
--- a/doc/src/fix_wall_gran.rst
+++ b/doc/src/fix_wall_gran.rst
@ -222,10 +222,10 @@ restart file, so that the operation of the fix continues in an
 uninterrupted fashion.

 If the :code:`contacts` option is used, this fix generates a per-atom array
-with 8 columns as output, containing the contact information for owned
+with at least 8 columns as output, containing the contact information for owned
 particles (nlocal on each processor). All columns in this per-atom array will
-be zero if no contact has occurred.  The values of these columns are listed in
-the following table:
+be zero if no contact has occurred.  The first 8 columns are
+listed in the following table.

 +-------+----------------------------------------------------+----------------+
 | Index | Value                                              | Units          |
@ -248,6 +248,14 @@ the following table:
 |     8 | Radius :math:`r` of atom                           | distance units |
 +-------+----------------------------------------------------+----------------+

+If a granular sub-model calculates additional contact information (e.g. the
+heat sub-models calculate the amount of heat exchanged), these quantities
+are appended to the end of this array. First, any extra values from the
+normal sub-model are appended followed by the damping, tangential, rolling,
+twisting, then heat models. See the descriptions of granular sub-models in
+the :doc:`pair granular <pair_granular>` page for information on any extra
+quantities.
+
 None of the :doc:`fix_modify <fix_modify>` options are relevant to this fix.
 No parameter of this fix can be used with the *start/stop* keywords of the
 :doc:`run <run>` command. This fix is not invoked during :doc:`energy
--- a/doc/src/fix_wall_gran_region.rst
+++ b/doc/src/fix_wall_gran_region.rst
@ -243,10 +243,10 @@ uninterrupted fashion.
   with a different region ID.

 If the :code:`contacts` option is used, this fix generates a per-atom array
-with 8 columns as output, containing the contact information for owned
+with at least 8 columns as output, containing the contact information for owned
 particles (nlocal on each processor). All columns in this per-atom array will
-be zero if no contact has occurred. The values of these columns are listed in
-the following table:
+be zero if no contact has occurred.  The first 8 columns of this array
+are listed in the following table.

 +-------+----------------------------------------------------+----------------+
 | Index | Value                                              | Units          |
@ -269,6 +269,14 @@ the following table:
 |     8 | Radius :math:`r` of atom                           | distance units |
 +-------+----------------------------------------------------+----------------+

+If a granular sub-model calculates additional contact information (e.g. the
+heat sub-models calculate the amount of heat exchanged), these quantities
+are appended to the end of this array. First, any extra values from the
+normal sub-model are appended followed by the damping, tangential, rolling,
+twisting, then heat models. See the descriptions of granular sub-models in
+the :doc:`pair granular <pair_granular>` page for information on any extra
+quantities.
+
 None of the :doc:`fix_modify <fix_modify>` options are relevant to this fix.
 No parameter of this fix can be used with the *start/stop* keywords of the
 :doc:`run <run>` command. This fix is not invoked during :doc:`energy
--- a/doc/src/kspace_modify.rst
+++ b/doc/src/kspace_modify.rst
@ -412,11 +412,9 @@ slab correction has also been extended to point dipole interactions
 .. note::

   If you wish to apply an electric field in the Z-direction, in
-   conjunction with the *slab* keyword, you should do it by adding
-   explicit charged particles to the +/- Z surfaces.  If you do it via
-   the :doc:`fix efield <fix_efield>` command, it will not give the correct
-   dielectric constant due to the Yeh/Berkowitz :ref:`(Yeh) <Yeh>` correction
-   not being compatible with how :doc:`fix efield <fix_efield>` works.
+   conjunction with the *slab* keyword, you can do it either by
+   adding explicit oppositely charged particles to the +/- Z surfaces,
+   or by using the :doc:`fix efield <fix_efield>` command.

 ----------

--- a/doc/src/pair_granular.rst
+++ b/doc/src/pair_granular.rst
@ -40,6 +40,9 @@ Examples
   pair_style granular
   pair_coeff * * hertz 1000.0 50.0 tangential mindlin 1000.0 1.0 0.4 heat area 0.1

+   pair_style granular
+   pair_coeff * * mdr 5e6 0.4 1.9e5 2.0 0.5 0.5 tangential linear_history 940.0 0.0 0.7 rolling sds 2.7e5 0.0 0.6 damping none
+
 Description
 """""""""""

@ -82,6 +85,7 @@ and their required arguments are:
 3. *hertz/material* : E, :math:`\eta_{n0}` (or :math:`e`), :math:`\nu`
 4. *dmt* : E, :math:`\eta_{n0}` (or :math:`e`), :math:`\nu`, :math:`\gamma`
 5. *jkr* : E, :math:`\eta_{n0}` (or :math:`e`), :math:`\nu`, :math:`\gamma`
+6. *mdr* : :math:`E`, :math:`\nu`, :math:`Y`, :math:`\Delta\gamma`, :math:`\psi_b`, :math:`e`

 Here, :math:`k_n` is spring stiffness (with units that depend on model
 choice, see below); :math:`\eta_{n0}` is a damping prefactor (or, in its
@ -162,6 +166,144 @@ initially will not experience force until they come into contact
 experience a tensile force up to :math:`3\pi\gamma R`, at which point they
 lose contact.

+The *mdr* model is a mechanically-derived contact model designed to capture the
+contact response between adhesive elastic-plastic particles up to large deformations.
+The theoretical foundations of the *mdr* model are detailed in the
+two-part series :ref:`Zunker and Kamrin Part I <Zunker2024I>` and
+:ref:`Zunker and Kamrin Part II <Zunker2024II>`. Further development
+and demonstrations of its application to industrially relevant powder
+compaction processes are presented in :ref:`Zunker et al. <Zunker2025>`.
+
+The model requires the following inputs:
+
+   1. *Young's modulus* :math:`E > 0` : The Young's modulus is commonly reported
+   for various powders.
+
+   2. *Poisson's ratio* :math:`0 \le \nu \le 0.5` : The Poisson's ratio is commonly
+   reported for various powders.
+
+   3. *Yield stress* :math:`Y \ge 0` : The yield stress is often known for powders
+   composed of materials such as metals but may be unreported for ductile organic
+   materials, in which case it can be treated as a free parameter.
+
+   4. *Effective surface energy* :math:`\Delta\gamma \ge 0` : The effective surface
+   energy for powder compaction applications is most easily determined through its
+   relation to the more commonly reported critical stress intensity factor
+   :math:`K_{Ic} = \sqrt{2\Delta\gamma E/(1-\nu^2)}`.
+
+   5. *Critical confinement ratio* :math:`0 \le \psi_b \le 1` : The critical confinement
+   ratio is a tunable parameter that determines when the bulk elastic response is
+   triggered. Lower values of :math:`\psi_b` delay the onset of the bulk elastic
+   response.
+
+   6. *Coefficient of restitution* :math:`0 \le e \le 1` : The coefficient of
+   restitution is a tunable parameter that controls damping in the normal direction.
+
+.. note::
+
+   The values for :math:`E`, :math:`\nu`, :math:`Y`, and :math:`\Delta\gamma` (i.e.,
+   :math:`K_{Ic}`) should be selected for zero porosity to reflect the intrinsic
+   material property rather than the bulk powder property.
+
+The *mdr* model produces a nonlinear force-displacement response, therefore the
+critical timestep :math:`\Delta t` depends on the inputs and level of
+deformation. As a conservative starting point the timestep can be assumed to be
+dictated by the bulk elastic response such that
+:math:`\Delta t = 0.35\sqrt{m/k_\textrm{bulk}}`, where :math:`m` is the mass of
+the smallest particle and :math:`k_\textrm{bulk} = \kappa R_\textrm{min}` is an
+effective stiffness related to the bulk elastic response.
+Here, :math:`\kappa = E/(3(1-2\nu))` is the bulk modulus and
+:math:`R_\textrm{min}` is the radius of the smallest particle.
+
+.. note::
+
+   The *mdr* model requires some specific settings to function properly,
+   please read the following text carefully to ensure all requirements are
+   followed.
+
+The *atom_style* must be set to *sphere 1* to enable dynamic particle
+radii. The *mdr* model is designed to respect the incompressibility of
+plastic deformation and inherently tracks free surface displacements
+induced by all particle contacts. In practice, this means that all particles
+begin with an initial radius, however as compaction occurs and plastic
+deformation is accumulated, a new enlarged apparent radius is defined to
+ensure that the volume change due to plastic deformation is not lost.
+This apparent radius is stored as the *atom radius* meaning it is used
+for subsequent neighbor list builds and contact detection checks. The
+advantage of this is that multi-neighbor dependent effects such as
+formation of secondary contacts caused by radial expansion are captured
+by the *mdr* model. Setting *atom_style sphere 1* ensures that updates to
+the particle radii are properly reflected throughout the simulation.
+
+.. code-block:: LAMMPS
+
+   atom_style sphere 1
+
+Newton's third law must be set to *off*. This ensures that the neighbor lists
+are constructed properly for the topological penalty algorithm used to screen
+for non-physical contacts occurring through obstructing particles, an issue
+prevalent under large deformation conditions. For more information on this
+algorithm see :ref:`Zunker et al. <Zunker2025>`.
+
+.. code-block:: LAMMPS
+
+   newton off
+
+The damping model must be set to *none*. The *mdr* model already has a
+built-in damping model.
+
+.. code-block:: LAMMPS
+
+   pair_coeff * * mdr 5e6 0.4 1.9e5 2 0.5 0.5 damping none
+
+The definition of multiple *mdr* models in the *pair_style* is currently not
+supported. Similarly, the *mdr* model cannot be combined with a different normal
+model in the *pair_style*. Physically this means that only one homogeneous
+collection of particles governed by a single *mdr* model is allowed.
+
+The *mdr* model currently only supports *fix wall/gran/region*, not
+*fix wall/gran*. If the *mdr* model is specified for the *pair_style*
+any *fix wall/gran/region* commands must also use the *mdr* model.
+Additionally, the following *mdr* inputs must match between the
+*pair_style* and *fix wall/gran/region* definitions: :math:`E`,
+:math:`\nu`, :math:`Y`, :math:`\psi_b`, and :math:`e`. The exception
+is :math:`\Delta\gamma`, which may vary, permitting different
+adhesive behaviors between particle-particle and particle-wall interactions.
+
+.. note::
+
+   The *mdr* model has a number of custom *property/atom* and *pair/local* definitions that
+   can be called in the input file. The useful properties for visualization
+   and analysis are described below.
+
+In addition to contact forces the *mdr* model also tracks the following
+quantities for each particle: elastic volume change, average normal
+stress components, total surface area involved in
+contact, and individual contact areas. In the input script, these quantities are
+initialized by calling *run 0* and can then be accessed using subsequent *compute*
+commands. The last *compute* command uses *pair/local p13* to calculate the pairwise
+contact areas for each active contact in the *group-ID*. Due to the use of an apparent
+radius in the *mdr* model, the keyword/arg pair *cutoff radius* must be specified for
+*pair/local* to properly detect existing contacts.
+
+.. code-block:: LAMMPS
+
+   run 0
+   compute ID group-ID property/atom d_Velas
+   compute ID group-ID property/atom d_sigmaxx
+   compute ID group-ID property/atom d_sigmayy
+   compute ID group-ID property/atom d_sigmazz
+   compute ID group-ID property/atom d_Acon1
+   compute ID group-ID pair/local p13 cutoff radius
+
+.. note::
+
+   The *mdr* model has two example input scripts within the
+   *examples/granular* directory. The first is a die compaction
+   simulation involving 200 particles named *in.tableting.200*.
+   The second is a triaxial compaction simulation involving 12
+   particles named *in.triaxial.compaction.12*.
+
 ----------

 In addition, the normal force is augmented by a damping term of the
@ -674,7 +816,10 @@ supported are:
 2. *radius* : :math:`k_{s}`
 3. *area* : :math:`h_{s}`

-If the *heat* keyword is not specified, the model defaults to *none*.
+If the *heat* keyword is not specified, the model defaults to *none*. All
+heat models calculate an additional pairwise quantity accessible by the
+single() function (described below) which is the heat conducted between the
+two particles.

 For *heat* *radius*, the heat
 :math:`Q` conducted between two particles is given by
@ -789,7 +934,7 @@ The single() function of these pair styles returns 0.0 for the energy
 of a pairwise interaction, since energy is not conserved in these
 dissipative potentials.  It also returns only the normal component of
 the pairwise interaction force.  However, the single() function also
-calculates 13 extra pairwise quantities.  The first 3 are the
+calculates at least 13 extra pairwise quantities.  The first 3 are the
 components of the tangential force between particles I and J, acting
 on particle I.  The fourth is the magnitude of this tangential force.
 The next 3 (5-7) are the components of the rolling torque acting on
@ -797,9 +942,17 @@ particle I. The next entry (8) is the magnitude of the rolling torque.
 The next entry (9) is the magnitude of the twisting torque acting
 about the vector connecting the two particle centers.
 The next 3 (10-12) are the components of the vector connecting
-the centers of the two particles (x_I - x_J). The last quantity (13)
-is the heat flow between the two particles, set to 0 if no heat model
-is active.
+the centers of the two particles (x_I - x_J). If a granular sub-model
+calculates additional contact information (e.g. the heat sub-models
+calculate the amount of heat exchanged), these quantities are appended
+to the end of this list. First, any extra values from the normal sub-model
+are appended followed by the damping, tangential, rolling, twisting, then
+heat models. See the descriptions of specific granular sub-models above
+for information on any extra quantities. If two or more models are
+defined by pair coefficients, the size of the array is set by the
+maximum number of extra quantities in a model but the order of quantities
+is determined by each model's specific set of sub-models. Any unused
+quantities are zeroed.

 These extra quantities can be accessed by the :doc:`compute pair/local <compute_pair_local>` command, as *p1*, *p2*, ...,
 *p12*\ .
@ -870,10 +1023,32 @@ solids. Proc. R. Soc. Lond. A, 324(1558), 301-313.

 .. _DMT1975:

-**Derjaguin et al, 1975)** Derjaguin, B. V., Muller, V. M., & Toporov,
+**(Derjaguin et al, 1975)** Derjaguin, B. V., Muller, V. M., & Toporov,
 Y. P. (1975). Effect of contact deformations on the adhesion of
 particles. Journal of Colloid and interface science, 53(2), 314-326.

+.. _Zunker2024I:
+
+**(Zunker and Kamrin, 2024)** Zunker, W., & Kamrin, K. (2024).
+A mechanically-derived contact model for adhesive elastic-perfectly
+plastic particles, Part I: Utilizing the method of dimensionality
+reduction. Journal of the Mechanics and Physics of Solids, 183, 105492.
+
+.. _Zunker2024II:
+
+**(Zunker and Kamrin, 2024)** Zunker, W., & Kamrin, K. (2024).
+A mechanically-derived contact model for adhesive elastic-perfectly
+plastic particles, Part II: Contact under high compaction-modeling
+a bulk elastic response. Journal of the Mechanics and Physics of Solids,
+183, 105493.
+
+.. _Zunker2025:
+
+**(Zunker et al, 2025)** Zunker, W., Dunatunga, S., Thakur, S.,
+Tang, P., & Kamrin, K. (2025). Experimentally validated DEM for large
+deformation powder compaction: mechanically-derived contact model and
+screening of non-physical contacts.
+
 .. _Luding2008:

 **(Luding, 2008)** Luding, S. (2008). Cohesive, frictional powders:
--- a/doc/src/pair_hbond_dreiding.rst
+++ b/doc/src/pair_hbond_dreiding.rst
@ -1,30 +1,46 @@
 .. index:: pair_style hbond/dreiding/lj
 .. index:: pair_style hbond/dreiding/lj/omp
+.. index:: pair_style hbond/dreiding/lj/angleoffset
+.. index:: pair_style hbond/dreiding/lj/angleoffset/omp
 .. index:: pair_style hbond/dreiding/morse
 .. index:: pair_style hbond/dreiding/morse/omp
+.. index:: pair_style hbond/dreiding/morse/angleoffset
+.. index:: pair_style hbond/dreiding/morse/angleoffset/omp

 pair_style hbond/dreiding/lj command
 ====================================

 Accelerator Variants: *hbond/dreiding/lj/omp*

+pair_style hbond/dreiding/lj/angleoffset command
+================================================
+
+Accelerator Variants: *hbond/dreiding/lj/angleoffset/omp*
+
 pair_style hbond/dreiding/morse command
 =======================================

 Accelerator Variants: *hbond/dreiding/morse/omp*

+pair_style hbond/dreiding/morse/angleoffset command
+===================================================
+
+Accelerator Variants: *hbond/dreiding/morse/angleoffset/omp*
+
+
 Syntax
 """"""

 .. code-block:: LAMMPS

-   pair_style style N inner_distance_cutoff outer_distance_cutoff angle_cutoff
+   pair_style style N inner_distance_cutoff outer_distance_cutoff angle_cutoff equilibrium_angle

-* style = *hbond/dreiding/lj* or *hbond/dreiding/morse*
+* style = *hbond/dreiding/lj* or *hbond/dreiding/morse* or *hbond/dreiding/lj/angleoffset* or *hbond/dreiding/morse/angleoffset*
 * N = power of cosine of angle theta (integer)
 * inner_distance_cutoff = global inner cutoff for Donor-Acceptor interactions (distance units)
 * outer_distance_cutoff = global cutoff for Donor-Acceptor interactions (distance units)
 * angle_cutoff = global angle cutoff for Acceptor-Hydrogen-Donor interactions (degrees)
+* (with style angleoffset) equilibrium_angle = global equilibrium angle for Acceptor-Hydrogen-Donor interactions (degrees)

 Examples
 """"""""
@ -40,6 +56,9 @@ Examples
   labelmap atom 1 C 2 O 3 H
   pair_coeff C O hbond/dreiding/morse H i 3.88 1.7241379 2.9 2 9.0 11.0 90.0

+   pair_style hybrid/overlay lj/cut 10.0 hbond/dreiding/lj/angleoffset 4 9.0 11.0 90 170.0
+   pair_coeff 1 2 hbond/dreiding/lj/angleoffset 3 i 9.5 2.75 4 9.0 11.0 90.0
+
 Description
 """""""""""

@ -74,42 +93,53 @@ hydrogen (H) and the donor atoms:
 .. image:: JPG/dreiding_hbond.jpg
   :align: center

-These 3-body interactions can be defined for pairs of acceptor and
-donor atoms, based on atom types.  For each donor/acceptor atom pair,
-the third atom in the interaction is a hydrogen permanently bonded to
-the donor atom, e.g. in a bond list read in from a data file via the
+These 3-body interactions can be defined for pairs of acceptor and donor
+atoms, based on atom types.  For each donor/acceptor atom pair, the
+third atom in the interaction is a hydrogen permanently bonded to the
+donor atom, e.g. in a bond list read in from a data file via the
 :doc:`read_data <read_data>` command.  The atom types of possible
 hydrogen atoms for each donor/acceptor type pair are specified by the
 :doc:`pair_coeff <pair_coeff>` command (see below).

 Style *hbond/dreiding/lj* is the original DREIDING potential of
-:ref:`(Mayo) <pair-Mayo>`.  It uses a LJ 12/10 functional for the Donor-Acceptor
-interactions. To match the results in the original paper, use n = 4.
+:ref:`(Mayo) <pair-Mayo>`.  It uses a LJ 12/10 functional for the
+Donor-Acceptor interactions. To match the results in the original paper,
+use n = 4.

 Style *hbond/dreiding/morse* is an improved version using a Morse
 potential for the Donor-Acceptor interactions. :ref:`(Liu) <Liu>` showed
 that the Morse form gives improved results for Dendrimer simulations,
 when n = 2.

+.. versionadded:: TBD
+
+The style variants *hbond/dreiding/lj/angleoffset* and
+*hbond/dreiding/morse/angleoffset* take the equilibrium angle of the AHD as
+input, allowing it to reach 180 degrees. This variant option was added
+to account for cases (especially in some coarse-grained models) in which
+the equilibrium state of the bonds may equal the minimum energy state.
+
 See the :doc:`Howto bioFF <Howto_bioFF>` page for more information
 on the DREIDING force field.

 .. note::

-   Because the Dreiding hydrogen bond potential is only one portion
-   of an overall force field which typically includes other pairwise
-   interactions, it is common to use it as a sub-style in a :doc:`pair_style hybrid/overlay <pair_hybrid>` command, where another pair style
-   provides the repulsive core interaction between pairs of atoms, e.g. a
-   1/r\^12 Lennard-Jones repulsion.
+   Because the Dreiding hydrogen bond potential is only one portion of
+   an overall force field which typically includes other pairwise
+   interactions, it is common to use it as a sub-style in a
+   :doc:`pair_style hybrid/overlay <pair_hybrid>` command, where another
+   pair style provides the repulsive core interaction between pairs of
+   atoms, e.g. a 1/r\^12 Lennard-Jones repulsion.

 .. note::

-   When using the hbond/dreiding pair styles with :doc:`pair_style hybrid/overlay <pair_hybrid>`, you should explicitly define pair
+   When using the hbond/dreiding pair styles with :doc:`pair_style
+   hybrid/overlay <pair_hybrid>`, you should explicitly define pair
   interactions between the donor atom and acceptor atoms, (as well as
   between these atoms and ALL other atoms in your system).  Whenever
-   :doc:`pair_style hybrid/overlay <pair_hybrid>` is used, ordinary mixing
-   rules are not applied to atoms like the donor and acceptor atoms
-   because they are typically referenced in multiple pair styles.
+   :doc:`pair_style hybrid/overlay <pair_hybrid>` is used, ordinary
+   mixing rules are not applied to atoms like the donor and acceptor
+   atoms because they are typically referenced in multiple pair styles.
   Neglecting to do this can cause difficult-to-detect physics problems.

 .. note::
@ -119,6 +149,13 @@ on the DREIDING force field.
   special_bonds command (e.g. "special_bonds lj 0.0 0.0 1.0") to turn
   these interactions on.

+.. note::
+
+   For the *angleoffset* variants, the referenced angle offset is the
+   supplementary angle of the equilibrium angle parameter. For example,
+   if the equilibrium angle is 166.6 degrees, the calculated angle
+   offset is 13.4 degrees.
+
 ----------

 The following coefficients must be defined for pairs of eligible
@ -169,7 +206,10 @@ follows:
 * distance cutoff :math:`r_{out}` (distance units)
 * angle cutoff (degrees)

-A single hydrogen atom type K can be specified, or a wild-card asterisk
+For both the *hbond/dreiding/lj/angleoffset* and *hbond/dreiding/morse/angleoffset* styles
+an additional parameter is added: the equilibrium angle (degrees).
+
+For all styles, a single hydrogen atom type K can be specified, or a wild-card asterisk
 can be used in place of or in conjunction with the K arguments to
 select multiple types as hydrogen atoms.  This takes the form
 "\*" or "\*n" or "n\*" or "m\*n".  See the :doc:`pair_coeff <pair_coeff>`
@ -245,8 +285,7 @@ heading) the following commands could be included in an input script:
 Restrictions
 """"""""""""

-This pair style can only be used if LAMMPS was built with the
-MOLECULE package.  See the :doc:`Build package <Build_package>` doc page
+The base pair styles can only be used if LAMMPS was built with the MOLECULE package.  The *angleoffset* variant also requires the EXTRA-MOLECULE package.  See the :doc:`Build package <Build_package>` doc page
 for more info.

 Related commands
--- a/doc/src/pair_mliap.rst
+++ b/doc/src/pair_mliap.rst
@ -145,6 +145,7 @@ per line.
 The detail of *nn* module implementation can be found at :ref:`(Yanxon) <Yanxon2020>`.

 .. admonition:: Notes on mliappy models
+   :class: note

   When the *model* keyword is *mliappy*, if the filename ends in '.pt',
   or '.pth', it will be loaded using pytorch; otherwise, it will be
--- a/doc/src/pair_reaxff.rst
+++ b/doc/src/pair_reaxff.rst
@ -158,11 +158,36 @@ drops to zero.
 Optional keywords *safezone*, *mincap*, and *minhbonds* are used
 for allocating reaxff arrays.  Increasing these values can avoid memory
 problems, such as segmentation faults and bondchk failed errors, that
-could occur under certain conditions. These keywords are not used by
+could occur under certain conditions. These keywords are **not** used by
 the Kokkos version, which instead uses a more robust memory allocation
 scheme that checks if the sizes of the arrays have been exceeded and
 automatically allocates more memory.

+.. admonition:: Memory management problems with ReaxFF
+   :class: tip
+
+   The LAMMPS implementation of ReaxFF is adapted from a standalone MD
+   program written in C called `PuReMD
+   <https://github.com/msu-sparta/PuReMD>`_.  It inherits from this code
+   a heuristic memory management that is different from what the rest of
+   LAMMPS uses.  It assumes that a system is dense and already well
+   equilibrated, so that there are no large changes in how many and what
+   types of neighbors atoms have.  However, not all systems are like
+   that, and thus there can be errors or segmentation faults if the
+   system changes too much.  If you run into problems, here are three
+   options to avoid them:
+
+   - Use the KOKKOS version of ReaxFF (KOKKOS is not only for GPUs,
+     but can also be compiled for serial or OpenMP execution) which
+     uses a different memory management approach.
+   - Break down a run command during which memory related errors happen
+     into multiple smaller segments so that the memory management
+     heuristics are re-initialized for each segment before they become
+     invalid.
+   - Increase the values for *safezone*, *mincap*, and *minhbonds* as
+     needed.  This can lead to a significant increase in memory consumption,
+     though.
+
 The keyword *tabulate* controls the size of interpolation table for
 Lennard-Jones and Coulomb interactions. Tabulation may also be set in the
 control file (see below). If tabulation is set in both the input script and the
--- a/doc/src/pair_style.rst
+++ b/doc/src/pair_style.rst
@ -207,7 +207,9 @@ accelerated styles exist.
 * :doc:`gw/zbl <pair_gw>` - Gao-Weber potential with a repulsive ZBL core
 * :doc:`harmonic/cut <pair_harmonic_cut>` - repulsive-only harmonic potential
 * :doc:`hbond/dreiding/lj <pair_hbond_dreiding>` - DREIDING hydrogen bonding LJ potential
+* :doc:`hbond/dreiding/lj/angleoffset <pair_hbond_dreiding>` - DREIDING hydrogen bonding LJ potential with offset for hbond angle
 * :doc:`hbond/dreiding/morse <pair_hbond_dreiding>` - DREIDING hydrogen bonding Morse potential
+* :doc:`hbond/dreiding/morse/angleoffset <pair_hbond_dreiding>` - DREIDING hydrogen bonding Morse potential with offset for hbond angle
 * :doc:`hdnnp <pair_hdnnp>` - High-dimensional neural network potential
 * :doc:`hippo <pair_amoeba>` -
 * :doc:`ilp/graphene/hbn <pair_ilp_graphene_hbn>` - registry-dependent interlayer potential (ILP)
--- a/doc/src/variable.rst
+++ b/doc/src/variable.rst
@ -56,7 +56,7 @@ Syntax
                          random(x,y,z), normal(x,y,z), ceil(x), floor(x), round(x), ternary(x,y,z),
                          ramp(x,y), stagger(x,y), logfreq(x,y,z), logfreq2(x,y,z),
                          logfreq3(x,y,z), stride(x,y,z), stride2(x,y,z,a,b,c),
-                          vdisplace(x,y), swiggle(x,y,z), cwiggle(x,y,z)
+                          vdisplace(x,y), swiggle(x,y,z), cwiggle(x,y,z), sign(x)
         group functions = count(group), mass(group), charge(group),
                           xcm(group,dim), vcm(group,dim), fcm(group,dim),
                           bound(group,dir), gyration(group), ke(group),
@ -532,37 +532,37 @@ functions, special functions, feature functions, atom values, atom
 vectors, custom atom properties, compute references, fix references, and references to other
 variables.

-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Number                 | 0.2, 100, 1.0e20, -15.4, etc                                                                                                                                                                                                                                                                                                                               |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Constant               | PI, version, on, off, true, false, yes, no                                                                                                                                                                                                                                                                                                                 |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Thermo keywords        | vol, pe, ebond, etc                                                                                                                                                                                                                                                                                                                                        |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Math operators         | (), -x, x+y, x-y, x\*y, x/y, x\^y, x%y, x == y, x != y, x < y, x <= y, x > y, x >= y, x && y, x \|\| y, x \|\^ y, !x                                                                                                                                                                                                                                       |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| Math functions         | sqrt(x), exp(x), ln(x), log(x), abs(x), sin(x), cos(x), tan(x), asin(x), acos(x), atan(x), atan2(y,x), random(x,y,z), normal(x,y,z), ceil(x), floor(x), round(x), ternary(x,y,z), ramp(x,y), stagger(x,y), logfreq(x,y,z), logfreq2(x,y,z), logfreq3(x,y,z), stride(x,y,z), stride2(x,y,z,a,b,c), vdisplace(x,y), swiggle(x,y,z), cwiggle(x,y,z)  |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| Math functions         | sqrt(x), exp(x), ln(x), log(x), abs(x), sin(x), cos(x), tan(x), asin(x), acos(x), atan(x), atan2(y,x), random(x,y,z), normal(x,y,z), ceil(x), floor(x), round(x), ternary(x,y,z), ramp(x,y), stagger(x,y), logfreq(x,y,z), logfreq2(x,y,z), logfreq3(x,y,z), stride(x,y,z), stride2(x,y,z,a,b,c), vdisplace(x,y), swiggle(x,y,z), cwiggle(x,y,z), sign(x)  |
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Group functions        | count(ID), mass(ID), charge(ID), xcm(ID,dim), vcm(ID,dim), fcm(ID,dim), bound(ID,dir), gyration(ID), ke(ID), angmom(ID,dim), torque(ID,dim), inertia(ID,dimdim), omega(ID,dim)                                                                                                                                                                             |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Region functions       | count(ID,IDR), mass(ID,IDR), charge(ID,IDR), xcm(ID,dim,IDR), vcm(ID,dim,IDR), fcm(ID,dim,IDR), bound(ID,dir,IDR), gyration(ID,IDR), ke(ID,IDR), angmom(ID,dim,IDR), torque(ID,dim,IDR), inertia(ID,dimdim,IDR), omega(ID,dim,IDR)                                                                                                                         |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Special functions      | sum(x), min(x), max(x), ave(x), trap(x), slope(x), sort(x), rsort(x), gmask(x), rmask(x), grmask(x,y), next(x), is_file(name), is_os(name), extract_setting(name), label2type(kind,label), is_typelabel(kind,label), is_timeout()                                                                                                                          |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Feature functions      | is_available(category,feature), is_active(category,feature), is_defined(category,id)                                                                                                                                                                                                                                                                       |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Atom values            | id[i], mass[i], type[i], mol[i], x[i], y[i], z[i], vx[i], vy[i], vz[i], fx[i], fy[i], fz[i], q[i]                                                                                                                                                                                                                                                          |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Atom vectors           | id, mass, type, mol, x, y, z, vx, vy, vz, fx, fy, fz, q                                                                                                                                                                                                                                                                                                    |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Custom atom properties | i_name, d_name, i_name[i], d_name[i], i2_name[i], d2_name[i], i2_name[i][j], d_name[i][j]                                                                                                                                                                                                                                                                  |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Compute references     | c_ID, c_ID[i], c_ID[i][j], C_ID, C_ID[i]                                                                                                                                                                                                                                                                                                                   |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Fix references         | f_ID, f_ID[i], f_ID[i][j], F_ID, F_ID[i]                                                                                                                                                                                                                                                                                                                   |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | Other variables        | v_name, v_name[i]                                                                                                                                                                                                                                                                                                                                          |
-+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

 Most of the formula elements produce a scalar value.  Some produce a
 global or per-atom vector of values.  Global vectors can be produced
@ -860,6 +860,9 @@ run, according to one of these formulas, where omega = 2 PI / period:

 where dt = the timestep size.

+The sign(x) function returns 1.0 if its argument x is greater than or
+equal to 0.0, and -1.0 otherwise.
+
 The run begins on startstep.  Startstep can span multiple runs, using
 the *start* keyword of the :doc:`run <run>` command.  See the :doc:`run
 <run>` command for details of how to do this.  Note that the
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@ -108,6 +108,7 @@ Andrienko
 Andzelm
 Ang
 anglegrad
+angleoffset
 angletangrad
 angmom
 angmomx
@ -1236,6 +1237,7 @@ fp
 fphi
 fPIC
 fplo
+fprintf
 Fqq
 Fraige
 framerate
@ -1582,6 +1584,7 @@ Impropers
 imulator
 includelink
 incompressible
+incompressibility
 incrementing
 indenter
 indenters
@ -1761,6 +1764,7 @@ Kadiri
 Kai
 Kalia
 Kamberaj
+Kamrin
 Kantorovich
 Kapfer
 Kapil
@ -2531,6 +2535,7 @@ Nevery
 Nevins
 newfile
 Newns
+newstep
 newtype
 nextsort
 Neyts
@ -3380,6 +3385,7 @@ Schilfgarde
 Schimansky
 Schiotz
 Schlitter
+Schmerler
 Schmid
 Schnieders
 Schoen
@ -3729,6 +3735,7 @@ tgnpt
 tgnvt
 th
 Thakkar
+Thakur
 Thaokar
 thb
 thei
@ -3767,6 +3774,7 @@ Tigran
 Tij
 Tildesley
 Timan
+timeflag
 timeI
 timespan
 timestamp
@ -3829,6 +3837,7 @@ Tref
 Tretyakov
 tri
 triangleflag
+triaxial
 Tribello
 triclinic
 Triclinic
@ -4042,6 +4051,7 @@ VMDARCH
 VMDHOME
 vn
 Voigt
+Vogel
 volfactor
 Volkov
 Volpe
--- a/examples/COUPLE/plugin/liblammpsplugin.c
+++ b/examples/COUPLE/plugin/liblammpsplugin.c
@ -118,6 +118,9 @@ liblammpsplugin_t *liblammpsplugin_load(const char *lib)
  ADDSYM(set_internal_variable);
  ADDSYM(variable_info);
  ADDSYM(eval);
+  ADDSYM(clearstep_compute);
+  ADDSYM(addstep_compute);
+  ADDSYM(addstep_compute_all);

  ADDSYM(gather_atoms);
  ADDSYM(gather_atoms_concat);
--- a/examples/COUPLE/plugin/liblammpsplugin.h
+++ b/examples/COUPLE/plugin/liblammpsplugin.h
@ -164,6 +164,9 @@ struct _liblammpsplugin {
  int (*set_internal_variable)(void *, const char *, double);
  int (*variable_info)(void *, int, char *, int);
  double (*eval)(void *, const char *);
+  void (*clearstep_compute)(void *);
+  void (*addstep_compute)(void *, void *);
+  void (*addstep_compute_all)(void *, void *);

  void (*gather_atoms)(void *, const char *, int, int, void *);
  void (*gather_atoms_concat)(void *, const char *, int, int, void *);
--- a/examples/COUPLE/plugin/simple.c
+++ b/examples/COUPLE/plugin/simple.c
@ -21,6 +21,7 @@

 #include <mpi.h>

+#include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@ -63,7 +64,7 @@ int main(int narg, char **arg)
  nprocs_lammps = atoi(arg[1]);
  if (nprocs_lammps > nprocs) {
    if (me == 0)
-      printf("ERROR: LAMMPS cannot use more procs than available\n");
+      printf("ERROR: LAMMPS cannot use more procs than available: %d\n", nprocs);
    MPI_Abort(MPI_COMM_WORLD,1);
  }

@ -76,7 +77,7 @@ int main(int narg, char **arg)
  if (me == 0) {
    fp = fopen(arg[2],"r");
    if (fp == NULL) {
-      printf("ERROR: Could not open LAMMPS input script\n");
+      printf("ERROR: Could not open LAMMPS input script %s: %s\n", arg[2], strerror(errno));
      MPI_Abort(MPI_COMM_WORLD,1);
    }
  }
@ -87,9 +88,10 @@ int main(int narg, char **arg)
     all LAMMPS procs call lammps_command() on the line */

  if (lammps == 1) {
+    errno = 0;
    plugin = liblammpsplugin_load(arg[3]);
    if (plugin == NULL) {
-      if (me == 0) printf("ERROR: Could not load shared LAMMPS library\n");
+      if (me == 0) printf("ERROR: Could not load shared LAMMPS library file %s: %s\n", arg[3], strerror(errno));
      MPI_Abort(MPI_COMM_WORLD,1);
    }
    /* must match the plugin ABI version */
--- a/examples/PACKAGES/stressprofile/in.flat
+++ b/examples/PACKAGES/stressprofile/in.flat
@ -32,7 +32,7 @@ fix             1 all nvt temp 0.7 0.7 0.2
 #dump_modify    3 pad 3

 fix 2 all recenter NULL NULL 15 units lattice
-compute p1 all stress/cartesian z 0.5
+compute p1 all stress/cartesian z 0.5 NULL 0
 fix 3 all ave/time 100 1 100 c_p1[*] file flat.out mode vector

 thermo          50
--- a/examples/granular/in.tableting.200
+++ b/examples/granular/in.tableting.200
@ -0,0 +1,149 @@
+##################################### SIMULATION SETTINGS ###################################################
+
+atom_style sphere 1
+atom_modify map array
+comm_modify vel yes
+units si
+newton off
+neighbor      1.0e-3 bin
+neigh_modify every 10 delay 60 check no
+timestep 4e-6
+#processors 2 2 1
+
+############################## SIMULATION BOUNDING BOX AND INSERT PARTICLES #################################
+
+boundary f f f
+read_data spheres200.data
+
+#################################### ADD DIE AND ATOM PARAMETERIZATION ######################################
+
+variable atomRadius    equal 0.44e-3*1.25
+variable atomDiameter  equal 2*${atomRadius}
+variable atomDensity   equal 1560
+variable atomMassAvg   equal ${atomDensity}*4.0/3.0*PI*${atomRadius}^3.0
+variable dieRadius     equal 4e-3
+variable dieHeight     equal 1e-2
+
+############################## PARTICLE MATERIAL PROPERTIES AND FORCE MODEL ##################################
+
+pair_style granular
+
+# mdr = E, nu, Y, gamma, psi_b, CoR
+variable YoungsModulus      equal 5e6
+variable YieldStress        equal 1.9e5
+variable PoissonsRatio      equal 0.4
+variable SurfaceEnergy      equal 2
+variable SurfaceEnergyWall  equal 0.0
+variable CoR                equal 0.5
+variable psi_b              equal 0.5
+
+# linear_history = k_t, x_gammat, mu_s
+variable kt         equal 2/7*${YoungsModulus}*${atomRadius}
+variable kt_wall    equal 2/7*${YoungsModulus}*${atomRadius}
+variable xgammat    equal 0.0 
+variable mu_s       equal 0.7
+variable mu_s_wall  equal 0.1
+
+# sds = k_roll, gamma_roll, mu_roll
+variable mu_roll     equal 0.6
+variable k_roll      equal 2.25*${mu_roll}*${mu_roll}*${YoungsModulus}*${atomRadius}
+variable gamma_roll  equal 0.0
+
+pair_coeff * * mdr ${YoungsModulus} ${PoissonsRatio} ${YieldStress} ${SurfaceEnergy} ${psi_b} ${CoR} tangential linear_history ${kt} ${xgammat} ${mu_s} rolling sds ${k_roll} ${gamma_roll} ${mu_roll} damping none
+
+######################################### ADD DIE AND PUNCH WALLS ############################################
+
+variable disp_upper      equal 0.0
+variable disp_lower      equal 0.0
+
+variable wall_contact_string string "granular mdr ${YoungsModulus} ${PoissonsRatio} ${YieldStress} ${SurfaceEnergyWall} ${psi_b} ${CoR} tangential linear_history ${kt_wall} ${xgammat} ${mu_s_wall} rolling sds ${k_roll} ${gamma_roll} ${mu_roll} damping none"
+
+variable dieHeight2 equal 2*${dieHeight}
+
+region lowerPunch plane 0 0 0 0 0 1 side in units box move NULL NULL v_disp_lower units box
+region upperPunch plane 0 0 ${dieHeight} 0 0 -1 side in move NULL NULL v_disp_upper units box
+region die cylinder z 0 0 ${dieRadius} 0 ${dieHeight2} side in units box
+
+fix lowerPunch all wall/gran/region ${wall_contact_string} region lowerPunch contacts
+fix upperPunch all wall/gran/region ${wall_contact_string} region upperPunch contacts
+fix die all wall/gran/region ${wall_contact_string} region die contacts
+
+compute avgUpperPunchForce all reduce sum f_upperPunch[4]
+variable avgUpperPunchForce equal c_avgUpperPunchForce
+compute avgLowerPunchForce all reduce sum f_lowerPunch[4]
+variable avgLowerPunchForce equal c_avgLowerPunchForce
+
+fix printFD all print 1 "${disp_upper} ${avgUpperPunchForce} ${avgLowerPunchForce}" file punch_force_disp_tableting200.csv screen no
+
+##################################### INTEGRATION AND GRAVITY #################################################
+
+fix 1 all nve/sphere
+fix grav all gravity 9.81 vector 0 0 -1
+
+########################################### SCREEN OUTPUT  ####################################################
+
+compute       1 all erotate/sphere
+thermo_style  custom dt step atoms ke vol v_disp_upper
+thermo        100
+thermo_modify lost ignore norm no
+
+##################################### SET UP DUMP OUTPUTS  ####################################################
+
+compute ke all ke/atom
+variable output_rate equal round(1e-3/dt)
+
+run 0
+
+compute sigmaxx all property/atom d_sigmaxx
+compute sigmayy all property/atom d_sigmayy
+compute sigmazz all property/atom d_sigmazz
+compute Velas all property/atom d_Velas
+
+compute sigmaxx_ave all reduce ave c_sigmaxx
+compute sigmayy_ave all reduce ave c_sigmayy
+compute sigmazz_ave all reduce ave c_sigmazz
+compute Velas_sum all reduce sum c_Velas
+
+variable sxx_ave equal c_sigmaxx_ave
+variable syy_ave equal c_sigmayy_ave
+variable szz_ave equal c_sigmazz_ave
+variable Vparticles equal c_Velas_sum
+
+fix log all print 1 "${sxx_ave} ${syy_ave} ${szz_ave} ${Vparticles}" file average_normal_stresses_tableting200.csv screen no
+dump dumpParticles all custom ${output_rate} tableting200.dump id type mass diameter x y z vx vy vz fx fy fz c_ke c_sigmaxx c_sigmayy c_sigmazz
+#dump dumpParticlesVTK all vtk ${output_rate} post/particles_*.vtk id x y z fx fy fz vx vy vz c_ke radius c_sigmaxx c_sigmayy c_sigmazz
+
+############################################## RUN SIMULATION #################################################
+
+variable upper_punch_stroke  equal 0.6733*${dieHeight}
+variable vel_upper           equal 0.25
+
+variable settling_steps      equal round(0.02/dt)
+variable compression_steps   equal 2*round(${upper_punch_stroke}/${vel_upper}/dt)
+variable ejection_steps      equal ${compression_steps}
+variable free_float_steps    equal round(0.02/dt)
+
+##### SETTLING #####
+
+run ${settling_steps}
+
+##### Compression & Release #####
+
+variable punch_frequency  equal PI/2/(dt*${compression_steps}/2)
+variable disp_upper       equal -${upper_punch_stroke}*sin(${punch_frequency}*elapsed*dt)
+variable short_release    equal round(${compression_steps}*1.0)
+run ${short_release}
+
+##### EJECTION #####
+
+variable punch_frequency  equal PI/2/(dt*${ejection_steps})
+variable disp_lower       equal ${dieHeight}*sin(${punch_frequency}*elapsed*dt)
+variable disp_upper       equal 0.9*v_disp_lower
+run ${ejection_steps}
+
+##### FREE FLOAT #####
+
+variable disp_lower  equal ${dieHeight}
+variable disp_upper  equal ${dieHeight}*0.9
+variable max_disp    equal ${dieRadius}*0.75
+run ${free_float_steps}
--- a/examples/granular/in.triaxial.compaction.12
+++ b/examples/granular/in.triaxial.compaction.12
@ -0,0 +1,109 @@
+############################### SIMULATION SETTINGS ###################################################
+
+atom_style      sphere 1
+atom_modify     map array 
+comm_modify vel yes
+units           si
+newton          off
+neighbor        2 bin
+neigh_modify    delay 0
+timestep        1e-6
+
+##################### SIMULATION BOUNDING BOX, INSERT PARTICLES, AND INTEGRATION #######################
+
+boundary f f f
+read_data spheres12.data
+fix integr all nve/sphere
+
+# create pair group for contact area outputs
+group particles_1_12 id 1 12
+
+########################### PARTICLE MATERIAL PROPERTIES AND FORCE MODEL ###############################
+
+variable atomRadius equal 0.5
+
+pair_style granular
+
+# mdr = E, nu, Y, gamma, psi_b, CoR 
+variable YoungsModulus  equal 1e9
+variable PoissonsRatio  equal 0.3
+variable YieldStress    equal 50e6
+variable SurfaceEnergy  equal 0.0
+variable psi_b          equal 0.5
+variable CoR            equal 0.5
+
+# linear_history = k_t, x_gammat, mu_s
+variable kt       equal 2/7*${YoungsModulus}*${atomRadius}
+variable xgammat  equal 0.0
+variable mu_s     equal 0.5
+
+pair_coeff * * mdr ${YoungsModulus} ${PoissonsRatio} ${YieldStress} ${SurfaceEnergy} ${psi_b} ${CoR} tangential linear_history ${kt} ${xgammat} ${mu_s} damping none
+
+######################################### ADD IN PLANES ################################################
+
+variable boxWidth equal 3
+variable halfBoxWidth equal ${boxWidth}/2
+
+variable plane_disp equal 0.0
+variable plane_disp_neg equal 0.0
+
+region plane_yz_pos plane ${halfBoxWidth} 0 0 -1 0 0 side in move v_plane_disp_neg NULL NULL units box
+region plane_yz_neg plane -${halfBoxWidth} 0 0 1 0 0 side in move v_plane_disp NULL NULL units box
+region plane_xz_pos plane 0 ${halfBoxWidth} 0 0 -1 0 side in move NULL v_plane_disp_neg NULL units box
+region plane_xz_neg plane 0 -${halfBoxWidth} 0 0 1 0 side in move NULL v_plane_disp NULL units box
+region plane_xy_pos plane 0 0 ${halfBoxWidth} 0 0 -1  side in move NULL NULL v_plane_disp_neg units box
+region plane_xy_neg plane 0 0 -${halfBoxWidth} 0 0 1 side in move NULL NULL v_plane_disp units box
+
+variable wall_contact_string string "granular mdr ${YoungsModulus} ${PoissonsRatio} ${YieldStress} ${SurfaceEnergy} ${psi_b} ${CoR} tangential linear_history ${kt} ${xgammat} ${mu_s} damping none"
+
+fix plane_yz_pos all wall/gran/region ${wall_contact_string} region plane_yz_pos contacts
+fix plane_yz_neg all wall/gran/region ${wall_contact_string} region plane_yz_neg contacts
+fix plane_xz_pos all wall/gran/region ${wall_contact_string} region plane_xz_pos contacts
+fix plane_xz_neg all wall/gran/region ${wall_contact_string} region plane_xz_neg contacts
+fix plane_xy_pos all wall/gran/region ${wall_contact_string} region plane_xy_pos contacts
+fix plane_xy_neg all wall/gran/region ${wall_contact_string} region plane_xy_neg contacts
+
+compute plane_xy_neg_force all reduce sum f_plane_xy_neg[4]
+variable plane_xy_neg_force equal c_plane_xy_neg_force
+
+compute plane_xz_neg_force all reduce sum f_plane_xz_neg[3]
+variable plane_xz_neg_force equal c_plane_xz_neg_force
+
+compute plane_yz_neg_force all reduce sum f_plane_yz_neg[2]
+variable plane_yz_neg_force equal c_plane_yz_neg_force
+
+fix print1 all print 1 "${plane_disp} ${plane_xy_neg_force} ${plane_xz_neg_force} ${plane_yz_neg_force}" file force_disp_triaxial12.csv screen no
+
+######################################## SCREEN OUTPUT  #################################################### 
+
+compute       1 all erotate/sphere
+thermo_style  custom dt step atoms ke c_1 vol 
+thermo        100
+thermo_modify lost ignore norm no
+
+##################################### DEFINE WALL MOVEMENT  #################################################
+
+variable disp_max equal 0.499
+variable ddisp equal 0.00001
+variable compression_steps equal round(${disp_max}/${ddisp})
+variable output_rate equal round(${compression_steps}/100)
+
+##################################### SET UP DUMP OUTPUTS  ####################################################
+ 
+dump dumpParticles all custom ${output_rate} triaxial_compaction_12.dump id type mass x y z vx vy vz fx fy fz radius
+#dump   dmp all vtk ${output_rate} post/triaxial12particles_*.vtk id type mass x y z vx vy vz fx fy fz radius
+
+#################################### COMPRESS THE PARTICLES  ##################################################
+
+run 0
+
+# print out contact area evolution for particles 1 and 12
+compute Ac_1_12 particles_1_12 pair/local p13 cutoff radius
+compute Ac_1_12_sum particles_1_12 reduce sum c_Ac_1_12 inputs local
+variable Ac_1_12 equal c_Ac_1_12_sum
+fix logArea all print 100 "${plane_disp} ${Ac_1_12}" file pair_1_12_contact_area_triaxial12.csv screen no
+
+variable plane_disp equal ${ddisp}*elapsed
+variable plane_disp_neg equal -${ddisp}*elapsed
+
+run ${compression_steps}
--- a/examples/granular/spheres12.data
+++ b/examples/granular/spheres12.data
@ -0,0 +1,23 @@
+#LAMMPS data file created by matlab.
+12 atoms
+
+1 atom types
+
+-10.0000000000 10.0000000000 xlo xhi
+-10.0000000000 10.0000000000 ylo yhi
+-10.0000000000 10.0000000000 zlo zhi
+
+Atoms
+
+1 1 0.8000000000 1000.0000000000 0.0717535226 -0.2092222842 0.3662146798
+2 1 1.2000000000 1000.0000000000 -0.8233763986 -0.7426114800 -0.8263932264
+3 1 0.8000000000 1000.0000000000 -1.0685863278 -0.4494609702 0.2196698078
+4 1 0.8000000000 1000.0000000000 0.5829432471 -1.0098803839 -0.7607543861
+5 1 0.8000000000 1000.0000000000 -0.8658471132 0.6951192569 0.0107556658
+6 1 1.2000000000 1000.0000000000 0.3966456126 0.7215053869 -0.7540113087
+7 1 1.2000000000 1000.0000000000 0.7316242921 0.8996483982 0.6751483031
+8 1 1.0000000000 1000.0000000000 0.6267527768 -0.8419367233 0.6964197101
+9 1 0.8000000000 1000.0000000000 -0.0409043189 -0.1452314035 -1.0102948313
+10 1 0.8000000000 1000.0000000000 -0.9495107709 0.6760151650 -0.9220534482
+11 1 1.0000000000 1000.0000000000 -0.7488486472 0.2188003421 0.7892021020
+12 1 1.2000000000 1000.0000000000 0.8968590780 -0.2350366437 -0.2006719701
--- a/examples/granular/spheres200.data
+++ b/examples/granular/spheres200.data
@ -0,0 +1,211 @@
+#LAMMPS data file created by matlab.
+200 atoms
+
+1 atom types
+
+-0.005000 0.005000 xlo xhi
+-0.005000 0.005000 ylo yhi
+-0.001000 0.020000 zlo zhi
+
+Atoms
+
+1 1 0.001206 1560.000000 -0.000938 0.000556 0.000883
+2 1 0.000953 1560.000000 -0.002626 -0.000145 0.002778
+3 1 0.001035 1560.000000 -0.000434 0.000172 0.008458
+4 1 0.001225 1560.000000 -0.003126 -0.000604 0.004986
+5 1 0.001119 1560.000000 0.000772 0.002972 0.002568
+6 1 0.001243 1560.000000 -0.000363 0.001184 0.004927
+7 1 0.001173 1560.000000 0.000218 0.000243 0.005475
+8 1 0.000937 1560.000000 0.000033 0.000029 0.003141
+9 1 0.001055 1560.000000 -0.001660 0.001975 0.008611
+10 1 0.000938 1560.000000 -0.001818 0.002352 0.002534
+11 1 0.000990 1560.000000 0.001592 0.000435 0.004416
+12 1 0.000927 1560.000000 -0.001659 -0.000004 0.005901
+13 1 0.001272 1560.000000 0.002972 0.000553 0.007291
+14 1 0.001226 1560.000000 0.002090 0.000983 0.001406
+15 1 0.000957 1560.000000 0.002241 -0.001608 0.001304
+16 1 0.001020 1560.000000 -0.001944 0.001290 0.002030
+17 1 0.001289 1560.000000 -0.002256 -0.001173 0.003474
+18 1 0.000998 1560.000000 0.000771 0.002127 0.000906
+19 1 0.000927 1560.000000 0.000186 0.000567 0.001207
+20 1 0.001095 1560.000000 -0.000937 -0.003179 0.008173
+21 1 0.001006 1560.000000 -0.001736 0.000751 0.004618
+22 1 0.001037 1560.000000 0.000784 0.001844 0.002380
+23 1 0.001297 1560.000000 0.000234 -0.001597 0.008560
+24 1 0.001017 1560.000000 0.002454 -0.000505 0.001171
+25 1 0.001110 1560.000000 -0.000803 -0.000415 0.003714
+26 1 0.001192 1560.000000 0.002283 0.000648 0.003048
+27 1 0.000992 1560.000000 -0.000065 -0.000545 0.007062
+28 1 0.001116 1560.000000 0.002174 -0.001463 0.005830
+29 1 0.001258 1560.000000 0.001602 0.001853 0.007246
+30 1 0.001055 1560.000000 -0.001535 -0.002770 0.007196
+31 1 0.000958 1560.000000 -0.000438 -0.000260 0.004709
+32 1 0.001188 1560.000000 0.000339 -0.000355 0.009171
+33 1 0.001166 1560.000000 0.002513 -0.001215 0.004434
+34 1 0.000907 1560.000000 0.001905 -0.000373 0.004921
+35 1 0.001245 1560.000000 -0.000091 -0.002620 0.004150
+36 1 0.001302 1560.000000 0.003292 0.000184 0.005377
+37 1 0.001305 1560.000000 0.002099 0.001261 0.008939
+38 1 0.000988 1560.000000 0.003274 0.000136 0.003667
+39 1 0.000892 1560.000000 0.001798 -0.002104 0.008610
+40 1 0.001247 1560.000000 -0.003058 -0.000575 0.000948
+41 1 0.000900 1560.000000 -0.000258 -0.000469 0.001478
+42 1 0.000945 1560.000000 -0.001434 -0.001711 0.004610
+43 1 0.000977 1560.000000 -0.001410 0.002808 0.004963
+44 1 0.000930 1560.000000 -0.002110 -0.001362 0.006749
+45 1 0.000931 1560.000000 0.001256 -0.000876 0.000844
+46 1 0.000901 1560.000000 0.000899 -0.001189 0.005316
+47 1 0.000940 1560.000000 -0.002189 -0.000047 0.007240
+48 1 0.001217 1560.000000 -0.000108 -0.001333 0.002257
+49 1 0.001088 1560.000000 0.001364 -0.000594 0.002789
+50 1 0.001143 1560.000000 -0.000311 -0.001425 0.006092
+51 1 0.001054 1560.000000 0.002262 0.002312 0.004315
+52 1 0.001016 1560.000000 -0.000724 0.000741 0.003295
+53 1 0.001051 1560.000000 0.000527 -0.001987 0.003307
+54 1 0.000905 1560.000000 0.000827 0.001457 0.005868
+55 1 0.001195 1560.000000 -0.001176 -0.000645 0.000798
+56 1 0.001253 1560.000000 0.002583 -0.001847 0.003310
+57 1 0.000982 1560.000000 0.001551 -0.002803 0.005076
+58 1 0.000945 1560.000000 -0.000481 0.000354 0.007220
+59 1 0.001040 1560.000000 -0.002736 0.001076 0.008769
+60 1 0.000917 1560.000000 0.000826 -0.001887 0.006449
+61 1 0.000914 1560.000000 -0.001171 -0.001592 0.007266
+62 1 0.000959 1560.000000 0.000834 -0.002671 0.007105
+63 1 0.000990 1560.000000 -0.000251 -0.001327 0.004339
+64 1 0.001220 1560.000000 0.001384 0.002896 0.005874
+65 1 0.000949 1560.000000 -0.001340 -0.000608 0.007496
+66 1 0.001306 1560.000000 0.002187 0.002068 0.002629
+67 1 0.001206 1560.000000 0.000148 0.001506 0.008517
+68 1 0.001123 1560.000000 0.001288 -0.000303 0.006613
+69 1 0.001151 1560.000000 -0.000876 0.001549 0.001740
+70 1 0.001315 1560.000000 -0.001902 -0.002590 0.001344
+71 1 0.000927 1560.000000 0.002285 -0.000866 0.006900
+72 1 0.001279 1560.000000 -0.000165 0.002689 0.007449
+73 1 0.000910 1560.000000 0.001009 0.001054 0.005049
+74 1 0.001148 1560.000000 -0.002229 -0.001285 0.008736
+75 1 0.001067 1560.000000 -0.000261 -0.002945 0.002157
+76 1 0.000993 1560.000000 -0.001641 0.002272 0.007601
+77 1 0.001228 1560.000000 0.001939 -0.000214 0.008903
+78 1 0.001076 1560.000000 0.000767 0.001172 0.003556
+79 1 0.001105 1560.000000 -0.000561 0.002493 0.004214
+80 1 0.001195 1560.000000 0.002694 -0.000817 0.007949
+81 1 0.001239 1560.000000 -0.000968 -0.003145 0.006096
+82 1 0.001083 1560.000000 -0.000808 0.001813 0.006396
+83 1 0.000923 1560.000000 0.000632 -0.001437 0.001310
+84 1 0.000981 1560.000000 -0.001842 0.002774 0.006508
+85 1 0.000998 1560.000000 -0.002775 0.001616 0.001453
+86 1 0.000979 1560.000000 -0.002520 0.001715 0.007741
+87 1 0.001002 1560.000000 -0.001465 -0.001931 0.006048
+88 1 0.000958 1560.000000 0.003264 0.000707 0.001189
+89 1 0.001052 1560.000000 -0.001314 -0.000701 0.002721
+90 1 0.001096 1560.000000 0.001154 0.002129 0.004403
+91 1 0.001104 1560.000000 0.002118 0.001977 0.000794
+92 1 0.001263 1560.000000 -0.001499 -0.002764 0.003441
+93 1 0.001086 1560.000000 -0.001096 0.002514 0.001154
+94 1 0.000895 1560.000000 0.001130 0.000029 0.001045
+95 1 0.000964 1560.000000 0.000905 -0.003200 0.000542
+96 1 0.000898 1560.000000 -0.000868 0.003148 0.008306
+97 1 0.000907 1560.000000 -0.001406 0.001144 0.007862
+98 1 0.001176 1560.000000 0.001246 -0.001074 0.004327
+99 1 0.001148 1560.000000 0.001512 -0.002739 0.003346
+100 1 0.000922 1560.000000 0.001470 -0.000036 0.007695
+101 1 0.001031 1560.000000 -0.002751 0.000928 0.004124
+102 1 0.001030 1560.000000 -0.000177 -0.002370 0.005374
+103 1 0.000915 1560.000000 0.000824 0.000521 0.007070
+104 1 0.001085 1560.000000 -0.002281 -0.000023 0.009123
+105 1 0.001004 1560.000000 -0.000167 0.002610 0.008905
+106 1 0.001060 1560.000000 -0.000389 -0.002220 0.007688
+107 1 0.000920 1560.000000 -0.000483 0.003231 0.006505
+108 1 0.001122 1560.000000 0.001781 -0.001547 0.002237
+109 1 0.001172 1560.000000 -0.002650 0.000830 0.005429
+110 1 0.001137 1560.000000 -0.000030 -0.003246 0.001024
+111 1 0.001315 1560.000000 0.001470 -0.001735 0.007580
+112 1 0.001245 1560.000000 0.000481 -0.003067 0.006025
+113 1 0.000904 1560.000000 0.000632 -0.000184 0.002010
+114 1 0.000883 1560.000000 -0.001828 0.002191 0.003819
+115 1 0.000974 1560.000000 0.002167 0.001616 0.006226
+116 1 0.001150 1560.000000 0.000871 -0.002731 0.002136
+117 1 0.001312 1560.000000 -0.000326 -0.001971 0.001000
+118 1 0.000914 1560.000000 0.001020 0.000810 0.002086
+119 1 0.001136 1560.000000 -0.000101 -0.003277 0.007246
+120 1 0.000991 1560.000000 -0.001944 0.000576 0.003215
+121 1 0.001216 1560.000000 -0.000913 -0.001165 0.008857
+122 1 0.001045 1560.000000 -0.003110 0.001062 0.002973
+123 1 0.000918 1560.000000 0.000348 0.000365 0.004046
+124 1 0.001279 1560.000000 -0.000884 0.003087 0.002268
+125 1 0.001065 1560.000000 -0.002238 0.001309 0.006452
+126 1 0.001012 1560.000000 -0.002059 -0.001354 0.001935
+127 1 0.001142 1560.000000 -0.003011 0.000567 0.001739
+128 1 0.000921 1560.000000 0.001764 0.002804 0.008177
+129 1 0.001151 1560.000000 -0.003105 -0.000384 0.006602
+130 1 0.000967 1560.000000 0.000932 0.000588 0.008823
+131 1 0.000908 1560.000000 -0.001873 -0.001947 0.007825
+132 1 0.000923 1560.000000 -0.002993 0.000883 0.007425
+133 1 0.001171 1560.000000 0.003310 -0.000405 0.006558
+134 1 0.000977 1560.000000 -0.000098 -0.000180 0.000492
+135 1 0.000938 1560.000000 -0.000706 -0.000129 0.006085
+136 1 0.001008 1560.000000 -0.000256 0.002333 0.000550
+137 1 0.001073 1560.000000 0.000534 -0.000055 0.008080
+138 1 0.000890 1560.000000 0.000351 0.001695 0.007195
+139 1 0.000973 1560.000000 0.002593 0.001907 0.005394
+140 1 0.001176 1560.000000 -0.001862 -0.000534 0.004494
+141 1 0.001306 1560.000000 -0.000951 0.001053 0.009299
+142 1 0.001103 1560.000000 -0.001937 -0.002711 0.008485
+143 1 0.001262 1560.000000 -0.002947 -0.001470 0.007682
+144 1 0.000914 1560.000000 0.002047 0.000811 0.005504
+145 1 0.000954 1560.000000 0.001935 -0.002349 0.006632
+146 1 0.001003 1560.000000 0.000766 -0.002635 0.008483
+147 1 0.001137 1560.000000 0.000102 0.003195 0.004922
+148 1 0.001006 1560.000000 -0.001982 0.001014 0.000685
+149 1 0.001255 1560.000000 -0.000718 0.001939 0.003056
+150 1 0.001057 1560.000000 -0.001189 -0.001717 0.003045
+151 1 0.001228 1560.000000 0.001581 0.002926 0.003510
+152 1 0.001052 1560.000000 -0.002172 0.001949 0.004831
+153 1 0.000979 1560.000000 -0.001817 0.000291 0.002048
+154 1 0.001286 1560.000000 -0.002647 -0.001839 0.004620
+155 1 0.001085 1560.000000 -0.000081 0.000850 0.002139
+156 1 0.000990 1560.000000 -0.000081 0.002105 0.005587
+157 1 0.001043 1560.000000 0.001636 -0.000112 0.001860
+158 1 0.001309 1560.000000 0.003216 -0.000851 0.002791
+159 1 0.000913 1560.000000 0.000608 0.003148 0.006565
+160 1 0.000919 1560.000000 0.000536 -0.003106 0.003249
+161 1 0.000943 1560.000000 0.003145 -0.000528 0.008915
+162 1 0.000993 1560.000000 -0.002811 -0.000099 0.008110
+163 1 0.001125 1560.000000 0.001415 -0.002271 0.000643
+164 1 0.000919 1560.000000 -0.001406 0.000223 0.006781
+165 1 0.001040 1560.000000 0.000690 0.003193 0.008329
+166 1 0.001055 1560.000000 0.001075 0.002584 0.009093
+167 1 0.001176 1560.000000 0.000851 0.003176 0.000591
+168 1 0.001003 1560.000000 -0.001462 0.001511 0.005544
+169 1 0.001126 1560.000000 -0.000077 0.003324 0.001347
+170 1 0.001068 1560.000000 0.003110 0.000810 0.008495
+171 1 0.001011 1560.000000 -0.001661 0.000117 0.008201
+172 1 0.001066 1560.000000 -0.000359 -0.003279 0.009094
+173 1 0.001303 1560.000000 0.003066 0.001188 0.004082
+174 1 0.000983 1560.000000 0.000354 0.002261 0.003558
+175 1 0.001137 1560.000000 0.002860 -0.001571 0.009180
+176 1 0.001070 1560.000000 0.001246 -0.001279 0.009104
+177 1 0.000886 1560.000000 0.002271 -0.000316 0.003675
+178 1 0.000983 1560.000000 -0.001987 -0.002490 0.005377
+179 1 0.000939 1560.000000 0.000601 -0.000861 0.003477
+180 1 0.001177 1560.000000 0.001522 0.002902 0.001690
+181 1 0.001036 1560.000000 -0.001200 -0.002874 0.004750
+182 1 0.000898 1560.000000 -0.001705 -0.001140 0.005503
+183 1 0.001315 1560.000000 0.002732 0.001766 0.007885
+184 1 0.001318 1560.000000 -0.002909 -0.001610 0.005936
+185 1 0.001218 1560.000000 0.003213 0.000884 0.002316
+186 1 0.001234 1560.000000 -0.002394 -0.002298 0.002575
+187 1 0.001160 1560.000000 -0.003313 -0.000065 0.003625
+188 1 0.001022 1560.000000 -0.003096 -0.001048 0.002151
+189 1 0.000966 1560.000000 0.001891 -0.002093 0.004404
+190 1 0.001048 1560.000000 -0.002367 0.002338 0.000697
+191 1 0.000995 1560.000000 -0.001204 -0.001912 0.002030
+192 1 0.001136 1560.000000 -0.001152 -0.002402 0.009223
+193 1 0.001083 1560.000000 -0.002588 -0.001768 0.000753
+194 1 0.000946 1560.000000 -0.001338 -0.000741 0.006527
+195 1 0.000943 1560.000000 -0.000073 0.003254 0.003663
+196 1 0.001059 1560.000000 0.000087 0.000958 0.006388
+197 1 0.001131 1560.000000 0.001030 0.001019 0.000752
+198 1 0.001257 1560.000000 -0.001365 0.002946 0.009266
+199 1 0.000891 1560.000000 -0.000445 -0.000273 0.002382
+200 1 0.001055 1560.000000 0.001781 0.000748 0.006583
--- a/examples/multi/in.granular
+++ b/examples/multi/in.granular
@ -1,4 +1,4 @@
-# Big colloid particles and small LJ particles
+# Binary granular system

 units           lj
 atom_style      sphere
--- a/examples/snap/README.md
+++ b/examples/snap/README.md
@ -9,5 +9,11 @@ in.snap.Mo_Chen                   # SNAP linear Mo potential
 in.snap.compute                   # SNAP compute for training a linear model
 in.snap.compute.quadratic         # SNAP compute for training a quadratic model
 in.snap.scale.Ni_Zuo_JCPA2020     # SNAP linear Ni potential with thermodynamic integration (fix adapt scale)
+in.C_SNAP                         # SNAP carbon potential

 compute_snap_dgrad.py             # SNAP compute with dgradflag (dBi/dRj) for training a non-linear model
+
+in.snap.grid                      # SNAP descriptors on a grid
+in.snap.grid.triclinic            # SNAP descriptors on a grid, triclinic
+in.gaussian.grid                  # Gaussian descriptors on a grid
+
--- a/examples/snap/in.gaussian.grid
+++ b/examples/snap/in.gaussian.grid
@ -0,0 +1,68 @@
+# Demonstrate calculation of Gaussian descriptors on a grid
+# for a cell with two atoms of type 1 and type 2.
+# The output in dump.glocal shows that for grid points
+# sitting on an atom of type 1 or 2:
+# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
+# val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# These values are extracted to the log file
+# 
+
+variable 	nrep index 1
+variable 	a index 3.316
+variable 	ngrid index 2
+
+units		metal
+atom_modify	map hash
+
+# generate the box and atom positions using a BCC lattice
+
+variable       	nx equal ${nrep}
+variable 	ny equal ${nrep}
+variable 	nz equal ${nrep}
+
+boundary	p p p
+
+lattice		custom $a &
+		a1 1 0 0 &
+		a2 0 1 0  &
+		a3 0 0 1 &
+		basis 0 0 0 &
+		basis 0.5 0.5 0.5
+
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+create_box	2 box
+create_atoms	1 box basis 1 1 basis 2 2
+
+mass 		* 180.88
+
+# define parameters for the Gaussian grid compute
+
+variable 	rcutfac equal 4.67637
+variable 	radelem1 equal 0.5
+variable 	radelem2 equal 0.5
+variable	sigmaelem1 equal 0.1355
+variable	sigmaelem2 equal 0.2
+variable 	gaussian_options string &
+		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
+		
+# build zero potential to force ghost atom creation
+
+pair_style      zero ${rcutfac}
+pair_coeff      * *
+
+# define grid compute
+
+compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} &
+	 	${gaussian_options}
+
+# define output
+
+dump		1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 		2 all custom 1000 dump.gatom id x y z
+compute		val1 all reduce max c_mygridlocal[7] inputs local
+compute		val2 all reduce max c_mygridlocal[8] inputs local
+thermo_style	custom step c_val1 c_val2
+
+# run
+
+run		0
--- a/examples/snap/in.snap.grid
+++ b/examples/snap/in.snap.grid
--- a/examples/snap/in.snap.grid.triclinic
+++ b/examples/snap/in.snap.grid.triclinic
@ -47,7 +47,6 @@ lattice		custom $a &
 		basis 0.0 0.0 0.5 &
 		spacing 1 1 1

-box 		tilt large
 region		box prism 0 ${nx} 0 ${ny} 0 ${nz} ${ny} ${nz} ${nz}
 create_box	1 box
 create_atoms	1 box
--- a/examples/snap/log.10Dec24.gaussian.grid.g++.1
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.1
@ -0,0 +1,129 @@
+LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
+  using 1 OpenMP thread(s) per MPI task
+# Demonstrate calculation of Gaussian descriptors on a grid
+# for a cell with two atoms of type 1 and type 2.
+# The output in dump.glocal shows that for grid points
+# sitting on an atom of type 1 or 2:
+# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
+# val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# These values are extracted to the log file
+#
+
+variable 	nrep index 1
+variable 	a index 3.316
+variable 	ngrid index 2
+
+units		metal
+atom_modify	map hash
+
+# generate the box and atom positions using a BCC lattice
+
+variable       	nx equal ${nrep}
+variable       	nx equal 1
+variable 	ny equal ${nrep}
+variable 	ny equal 1
+variable 	nz equal ${nrep}
+variable 	nz equal 1
+
+boundary	p p p
+
+lattice		custom $a 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+lattice		custom 3.316 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+Lattice spacing in x,y,z = 3.316 3.316 3.316
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+region		box block 0 1 0 ${ny} 0 ${nz}
+region		box block 0 1 0 1 0 ${nz}
+region		box block 0 1 0 1 0 1
+create_box	2 box
+Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  1 by 1 by 1 MPI processor grid
+create_atoms	1 box basis 1 1 basis 2 2
+Created 2 atoms
+  using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  create_atoms CPU = 0.001 seconds
+
+mass 		* 180.88
+
+# define atom compute and grid compute
+
+variable 	rcutfac equal 4.67637
+variable 	radelem1 equal 0.5
+variable 	radelem2 equal 0.5
+variable	sigmaelem1 equal 0.1355
+variable	sigmaelem2 equal 0.2
+variable 	gaussian_options string 		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
+4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 0.2
+
+# build zero potential to force ghost atom creation
+
+pair_style      zero ${rcutfac}
+pair_style      zero 4.67637
+pair_coeff      * *
+
+# define atom and grid computes
+
+compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	4.67637 0.5 0.5 0.1355 0.2
+
+# define output
+
+dump		1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 		2 all custom 1000 dump.gatom id x y z
+compute		val1 all reduce max c_mygridlocal[7] inputs local
+compute		val2 all reduce max c_mygridlocal[8] inputs local
+thermo_style	custom step c_val1 c_val2
+
+# run
+
+run		0
+WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 6.67637
+  ghost atom cutoff = 6.67637
+  binsize = 3.338185, bins = 1 1 1
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes
+   Step         c_val1         c_val2    
+         0   25.521859      7.9367045    
+Loop time of 1.088e-06 on 1 procs for 0 steps with 2 atoms
+
+183.8% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 1.088e-06  |            |       |100.00
+
+Nlocal:              2 ave           2 max           2 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:            339 ave         339 max         339 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:             64 ave          64 max          64 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 64
+Ave neighs/atom = 32
+Neighbor list builds = 0
+Dangerous builds = 0
+Total wall time: 0:00:00
--- a/examples/snap/log.10Dec24.gaussian.grid.g++.4
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.4
@ -0,0 +1,130 @@
+LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
+  using 1 OpenMP thread(s) per MPI task
+# Demonstrate calculation of Gaussian descriptors on a grid
+# for a cell with two atoms of type 1 and type 2.
+# The output in dump.glocal shows that for grid points
+# sitting on an atom of type 1 or 2:
+# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
+# val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# These values are extracted to the log file
+#
+
+variable 	nrep index 1
+variable 	a index 3.316
+variable 	ngrid index 2
+
+units		metal
+atom_modify	map hash
+
+# generate the box and atom positions using a BCC lattice
+
+variable       	nx equal ${nrep}
+variable       	nx equal 1
+variable 	ny equal ${nrep}
+variable 	ny equal 1
+variable 	nz equal ${nrep}
+variable 	nz equal 1
+
+boundary	p p p
+
+lattice		custom $a 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+lattice		custom 3.316 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+Lattice spacing in x,y,z = 3.316 3.316 3.316
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+region		box block 0 1 0 ${ny} 0 ${nz}
+region		box block 0 1 0 1 0 ${nz}
+region		box block 0 1 0 1 0 1
+create_box	2 box
+Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  1 by 2 by 2 MPI processor grid
+create_atoms	1 box basis 1 1 basis 2 2
+Created 2 atoms
+  using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  create_atoms CPU = 0.001 seconds
+
+mass 		* 180.88
+
+# define atom compute and grid compute
+
+variable 	rcutfac equal 4.67637
+variable 	radelem1 equal 0.5
+variable 	radelem2 equal 0.5
+variable	sigmaelem1 equal 0.1355
+variable	sigmaelem2 equal 0.2
+variable 	gaussian_options string 		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
+4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 0.2
+
+# build zero potential to force ghost atom creation
+
+pair_style      zero ${rcutfac}
+pair_style      zero 4.67637
+pair_coeff      * *
+
+# define atom and grid computes
+
+compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	4.67637 0.5 0.5 0.1355 0.2
+
+# define output
+
+dump		1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 		2 all custom 1000 dump.gatom id x y z
+compute		val1 all reduce max c_mygridlocal[7] inputs local
+compute		val2 all reduce max c_mygridlocal[8] inputs local
+thermo_style	custom step c_val1 c_val2
+
+# run
+
+run		0
+WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 6.67637
+  ghost atom cutoff = 6.67637
+  binsize = 3.338185, bins = 1 1 1
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+WARNING: Proc sub-domain size < neighbor skin, could lead to lost atoms (src/domain.cpp:1202)
+Per MPI rank memory allocation (min/avg/max) = 3.522 | 3.523 | 3.524 Mbytes
+   Step         c_val1         c_val2    
+         0   25.521859      7.9367045    
+Loop time of 2.238e-06 on 4 procs for 0 steps with 2 atoms
+
+89.4% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 2.238e-06  |            |       |100.00
+
+Nlocal:            0.5 ave           1 max           0 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Nghost:          274.5 ave         275 max         274 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Neighs:             16 ave          40 max           0 min
+Histogram: 2 0 0 0 0 0 1 0 0 1
+
+Total # of neighbors = 64
+Ave neighs/atom = 32
+Neighbor list builds = 0
+Dangerous builds = 0
+Total wall time: 0:00:00
--- a/fortran/lammps.f90
+++ b/fortran/lammps.f90
@ -127,6 +127,16 @@ MODULE LIBLAMMPS
    PROCEDURE :: set_string_variable    => lmp_set_string_variable
    PROCEDURE :: set_internal_variable  => lmp_set_internal_variable
    PROCEDURE :: eval                   => lmp_eval
+
+    PROCEDURE :: clearstep_compute      => lmp_clearstep_compute
+    PROCEDURE, PRIVATE :: lmp_addstep_compute_smallint
+    PROCEDURE, PRIVATE :: lmp_addstep_compute_bigint
+    GENERIC :: addstep_compute          => lmp_addstep_compute_smallint, lmp_addstep_compute_bigint
+    PROCEDURE, PRIVATE :: lmp_addstep_compute_all_smallint
+    PROCEDURE, PRIVATE :: lmp_addstep_compute_all_bigint
+    GENERIC :: addstep_compute_all      => lmp_addstep_compute_all_smallint, &
+                                           lmp_addstep_compute_all_bigint
+
    PROCEDURE, PRIVATE :: lmp_gather_atoms_int
    PROCEDURE, PRIVATE :: lmp_gather_atoms_double
    GENERIC   :: gather_atoms           => lmp_gather_atoms_int, &
@ -626,6 +636,24 @@ MODULE LIBLAMMPS
      REAL(c_double) :: lammps_eval
    END FUNCTION lammps_eval

+    SUBROUTINE lammps_clearstep_compute(handle) BIND(C)
+      IMPORT :: c_ptr
+      IMPLICIT NONE
+      TYPE(c_ptr), VALUE :: handle
+    END SUBROUTINE lammps_clearstep_compute
+
+    SUBROUTINE lammps_addstep_compute(handle, step) BIND(C)
+      IMPORT :: c_ptr
+      IMPLICIT NONE
+      TYPE(c_ptr), VALUE :: handle, step
+    END SUBROUTINE lammps_addstep_compute
+
+    SUBROUTINE lammps_addstep_compute_all(handle, step) BIND(C)
+      IMPORT :: c_ptr
+      IMPLICIT NONE
+      TYPE(c_ptr), VALUE :: handle, step
+    END SUBROUTINE lammps_addstep_compute_all
+
    SUBROUTINE lammps_gather_atoms(handle, name, TYPE, count, DATA) BIND(C)
      IMPORT :: c_int, c_ptr
      IMPLICIT NONE
@ -1846,6 +1874,80 @@ CONTAINS
    CALL lammps_free(Cexpr)
  END FUNCTION lmp_eval

+  ! equivalent subroutine to lammps_clearstep_compute
+  SUBROUTINE lmp_clearstep_compute(self)
+    CLASS(lammps), INTENT(IN) :: self
+    CALL lammps_clearstep_compute(self%handle)
+  END SUBROUTINE lmp_clearstep_compute
+
+  ! equivalent subroutine to lammps_addstep_compute (for INTEGER(kind=8) step argument)
+  SUBROUTINE lmp_addstep_compute_bigint(self, nextstep)
+    CLASS(lammps), INTENT(IN) :: self
+    INTEGER(kind=8), INTENT(IN) :: nextstep
+    INTEGER(c_int), TARGET :: smallstep
+    INTEGER(c_int64_t), TARGET :: bigstep
+    TYPE(c_ptr) :: ptrstep
+    IF (SIZE_BIGINT == 4_c_int) THEN  ! pick the local copy whose kind matches SIZE_BIGINT
+        smallstep = INT(nextstep,kind=c_int)  ! explicit narrowing to c_int
+        ptrstep = C_LOC(smallstep)
+    ELSE
+        bigstep = nextstep
+        ptrstep = C_LOC(bigstep)
+    END IF
+    CALL lammps_addstep_compute(self%handle, ptrstep)  ! step is handed over by address
+  END SUBROUTINE lmp_addstep_compute_bigint
+
+  ! equivalent subroutine to lammps_addstep_compute (for INTEGER(kind=4) step argument)
+  SUBROUTINE lmp_addstep_compute_smallint(self, nextstep)
+    CLASS(lammps), INTENT(IN) :: self
+    INTEGER(kind=4), INTENT(IN) :: nextstep
+    INTEGER(c_int), TARGET :: smallstep
+    INTEGER(c_int64_t), TARGET :: bigstep
+    TYPE(c_ptr) :: ptrstep
+    IF (SIZE_BIGINT == 4_c_int) THEN  ! pick the local copy whose kind matches SIZE_BIGINT
+        smallstep = nextstep
+        ptrstep = C_LOC(smallstep)
+    ELSE
+        bigstep = nextstep  ! widened by assignment to c_int64_t
+        ptrstep = C_LOC(bigstep)
+    END IF
+    CALL lammps_addstep_compute(self%handle, ptrstep)  ! step is handed over by address
+  END SUBROUTINE lmp_addstep_compute_smallint
+
+  ! equivalent subroutine to lammps_addstep_compute_all (for INTEGER(kind=8) step argument)
+  SUBROUTINE lmp_addstep_compute_all_bigint(self, nextstep)
+    CLASS(lammps), INTENT(IN) :: self
+    INTEGER(kind=8), INTENT(IN) :: nextstep
+    INTEGER(c_int), TARGET :: smallstep
+    INTEGER(c_int64_t), TARGET :: bigstep
+    TYPE(c_ptr) :: ptrstep
+    IF (SIZE_BIGINT == 4_c_int) THEN  ! pick the local copy whose kind matches SIZE_BIGINT
+        smallstep = INT(nextstep,kind=c_int)  ! explicit narrowing to c_int
+        ptrstep = C_LOC(smallstep)
+    ELSE
+        bigstep = nextstep
+        ptrstep = C_LOC(bigstep)
+    END IF
+    CALL lammps_addstep_compute_all(self%handle, ptrstep)  ! step is handed over by address
+  END SUBROUTINE lmp_addstep_compute_all_bigint
+
+  ! equivalent subroutine to lammps_addstep_compute_all (for INTEGER(kind=4) step argument)
+  SUBROUTINE lmp_addstep_compute_all_smallint(self, nextstep)
+    CLASS(lammps), INTENT(IN) :: self
+    INTEGER(kind=4), INTENT(IN) :: nextstep
+    INTEGER(c_int), TARGET :: smallstep
+    INTEGER(c_int64_t), TARGET :: bigstep
+    TYPE(c_ptr) :: ptrstep
+    IF (SIZE_BIGINT == 4_c_int) THEN  ! pick the local copy whose kind matches SIZE_BIGINT
+        smallstep = nextstep
+        ptrstep = C_LOC(smallstep)
+    ELSE
+        bigstep = nextstep  ! widened by assignment to c_int64_t
+        ptrstep = C_LOC(bigstep)
+    END IF
+    CALL lammps_addstep_compute_all(self%handle, ptrstep)  ! step is handed over by address
+  END SUBROUTINE lmp_addstep_compute_all_smallint
+
  ! equivalent function to lammps_gather_atoms (for integers)
  SUBROUTINE lmp_gather_atoms_int(self, name, count, data)
    CLASS(lammps), INTENT(IN) :: self
--- a/lib/plumed/Install.py
+++ b/lib/plumed/Install.py
@ -19,7 +19,7 @@ parser = ArgumentParser(prog='Install.py',
 # Note: must also adjust check for supported API versions in
 # fix_plumed.cpp when version changes from v2.n.x to v2.n+1.y

-version = "2.9.2"
+version = "2.9.3"
 mode = "static"

 # help message
@ -52,6 +52,7 @@ checksums = { \
        '2.9.0' : '661eabeebee05cf84bbf9dc23d7d5f46', \
        '2.9.1' : 'c3b2d31479c1e9ce211719d40e9efbd7', \
        '2.9.2' : '04862602a372c1013bdfee2d6d03bace', \
+        '2.9.3' : 'ee1249805fe94bccee17d10610d3f6f1', \
        }

 # parse and process arguments
--- a/python/lammps/core.py
+++ b/python/lammps/core.py
@ -422,6 +422,10 @@ class lammps(object):
    self.lib.lammps_extract_variable_datatype.argtypes = [c_void_p, c_char_p]
    self.lib.lammps_extract_variable_datatype.restype = c_int

+    self.lib.lammps_clearstep_compute.argtypes = [c_void_p]
+    self.lib.lammps_addstep_compute.argtypes = [c_void_p, c_void_p]
+    self.lib.lammps_addstep_compute_all.argtypes = [c_void_p, c_void_p]
+
    self.lib.lammps_eval.argtypes = [c_void_p, c_char_p]
    self.lib.lammps_eval.restype = c_double

@ -1594,6 +1598,26 @@ class lammps(object):

  # -------------------------------------------------------------------------

+  def clearstep_compute(self, nextstep=None):  # nextstep is unused; optional for backward compatibility
+    with ExceptionCheck(self):
+      return self.lib.lammps_clearstep_compute(self.lmp)
+
+  # -------------------------------------------------------------------------
+
+  def addstep_compute(self, nextstep):
+    with ExceptionCheck(self):
+      nextstep = self.c_bigint(nextstep)  # the C API expects the address of a bigint
+      return self.lib.lammps_addstep_compute(self.lmp, POINTER(self.c_bigint)(nextstep))
+
+  # -------------------------------------------------------------------------
+
+  def addstep_compute_all(self, nextstep):
+    with ExceptionCheck(self):
+      nextstep = self.c_bigint(nextstep)  # the C API expects the address of a bigint
+      return self.lib.lammps_addstep_compute_all(self.lmp, POINTER(self.c_bigint)(nextstep))
+
+  # -------------------------------------------------------------------------
+
  def flush_buffers(self):
    """Flush output buffers

@ -1694,7 +1718,6 @@ class lammps(object):

    with ExceptionCheck(self):
      return self.lib.lammps_eval(self.lmp, newexpr)
-    return None

  # -------------------------------------------------------------------------

--- a/src/.gitignore
+++ b/src/.gitignore
@ -252,6 +252,8 @@
 /*rheo*.cpp
 /*rheo*.h

+/compute_gaussian_grid_local.cpp
+/compute_gaussian_grid_local.h
 /compute_grid.cpp
 /compute_grid.h
 /compute_grid_local.cpp
@ -849,6 +851,8 @@
 /fix_ffl.h
 /fix_filter_corotate.cpp
 /fix_filter_corotate.h
+/fix_granular_mdr.cpp
+/fix_granular_mdr.h
 /fix_viscosity.cpp
 /fix_viscosity.h
 /fix_ehex.cpp
@ -1277,6 +1281,10 @@
 /pair_hbond_dreiding_lj.h
 /pair_hbond_dreiding_morse.cpp
 /pair_hbond_dreiding_morse.h
+/pair_hbond_dreiding_lj_angleoffset.cpp
+/pair_hbond_dreiding_lj_angleoffset.h
+/pair_hbond_dreiding_morse_angleoffset.cpp
+/pair_hbond_dreiding_morse_angleoffset.h
 /pair_hdnnp.cpp
 /pair_hdnnp.h
 /pair_ilp_graphene_hbn.cpp
--- a/src/AMOEBA/fix_amoeba_pitorsion.cpp
+++ b/src/AMOEBA/fix_amoeba_pitorsion.cpp
@ -773,9 +773,9 @@ bigint FixAmoebaPiTorsion::read_data_skip_lines(char *keyword)

 void FixAmoebaPiTorsion::write_data_header(FILE *fp, int mth)
 {
-  if (mth == 0) fmt::print(fp,"{} pitorsions\n",npitorsions);
+  if (mth == 0) utils::print(fp,"{} pitorsions\n",npitorsions);
  else if (mth == 1)
-    fmt::print(fp, "{} pitorsion types\n",npitorsion_types);
+    utils::print(fp, "{} pitorsion types\n",npitorsion_types);
 }

 /* ----------------------------------------------------------------------
--- a/src/BODY/body_nparticle.cpp
+++ b/src/BODY/body_nparticle.cpp
@ -261,22 +261,22 @@ int BodyNparticle::write_data_body(FILE *fp, double *buf)

  // atomID ninteger ndouble

-  fmt::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
+  utils::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
  m += 3;

  const int nsub = (int) ubuf(buf[m++]).i;
-  fmt::print(fp,"{}\n",nsub);
+  utils::print(fp,"{}\n",nsub);

  // inertia

-  fmt::print(fp,"{} {} {} {} {} {}\n",
+  utils::print(fp,"{} {} {} {} {} {}\n",
             buf[m+0],buf[m+1],buf[m+2],buf[m+3],buf[m+4],buf[m+5]);
  m += 6;

  // nsub vertices

  for (int i = 0; i < nsub; i++) {
-    fmt::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
+    utils::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
    m += 3;
  }

--- a/src/BODY/body_rounded_polygon.cpp
+++ b/src/BODY/body_rounded_polygon.cpp
@ -398,27 +398,27 @@ int BodyRoundedPolygon::write_data_body(FILE *fp, double *buf)

  // atomID ninteger ndouble

-  fmt::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
+  utils::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
  m += 3;

  const int nsub = (int) ubuf(buf[m++]).i;
-  fmt::print(fp,"{}\n",nsub);
+  utils::print(fp,"{}\n",nsub);

  // inertia

-  fmt::print(fp,"{} {} {} {} {} {}\n",
+  utils::print(fp,"{} {} {} {} {} {}\n",
             buf[m+0],buf[m+1],buf[m+2],buf[m+3],buf[m+4],buf[m+5]);
  m += 6;

  // nsub vertices

  for (int i = 0; i < nsub; i++, m+=3)
-    fmt::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
+    utils::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);

  // rounded diameter

  double diameter = buf[m++];
-  fmt::print(fp,"{}\n",diameter);
+  utils::print(fp,"{}\n",diameter);

  return m;
 }
--- a/src/BODY/body_rounded_polyhedron.cpp
+++ b/src/BODY/body_rounded_polyhedron.cpp
@ -476,7 +476,7 @@ int BodyRoundedPolyhedron::write_data_body(FILE *fp, double *buf)

  // atomID ninteger ndouble

-  fmt::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
+  utils::print(fp,"{} {} {}\n",ubuf(buf[m]).i,ubuf(buf[m+1]).i,ubuf(buf[m+2]).i);
  m += 3;

  // nvert, nedge, nface
@ -484,27 +484,27 @@ int BodyRoundedPolyhedron::write_data_body(FILE *fp, double *buf)
  const int nsub = (int) ubuf(buf[m++]).i;
  const int nedge = (int) ubuf(buf[m++]).i;
  const int nface = (int) ubuf(buf[m++]).i;
-  fmt::print(fp,"{} {} {}\n",nsub,nedge,nface);
+  utils::print(fp,"{} {} {}\n",nsub,nedge,nface);

  // inertia

-  fmt::print(fp,"{} {} {} {} {} {}\n",
+  utils::print(fp,"{} {} {} {} {} {}\n",
             buf[m+0],buf[m+1],buf[m+2],buf[m+3],buf[m+4],buf[m+5]);
  m += 6;

  // nsub vertices

  for (int i = 0; i < nsub; i++, m+=3)
-    fmt::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);
+    utils::print(fp,"{} {} {}\n",buf[m],buf[m+1],buf[m+2]);

  // nedge 2-tuples and nface 4-tuples
  // unless nsub = 1 or 2

  if (nsub > 2) {
    for (int i = 0; i < nedge; i++, m+=2)
-      fmt::print(fp,"{} {}\n",static_cast<int> (buf[m]),static_cast<int> (buf[m+1]));
+      utils::print(fp,"{} {}\n",static_cast<int> (buf[m]),static_cast<int> (buf[m+1]));
    for (int i = 0; i < nface; i++, m+=4)
-      fmt::print(fp,"{} {} {} {}\n",
+      utils::print(fp,"{} {} {} {}\n",
                 static_cast<int> (buf[m]),static_cast<int> (buf[m+1]),
                 static_cast<int> (buf[m+2]),static_cast<int> (buf[m+3]));
  }
@ -512,7 +512,7 @@ int BodyRoundedPolyhedron::write_data_body(FILE *fp, double *buf)
  // rounded diameter

  double diameter = buf[m++];
-  fmt::print(fp,"{}\n",diameter);
+  utils::print(fp,"{}\n",diameter);

  return m;
 }
--- a/src/DPD-REACT/fix_rx.cpp
+++ b/src/DPD-REACT/fix_rx.cpp
@ -307,9 +307,16 @@ void FixRX::post_constructor()
  id_fix_species = utils::strdup(std::string(id)+"_SPECIES");
  id_fix_species_old = utils::strdup(std::string(id)+"_SPECIES_OLD");

-  const std::string fmtstr = "{} {} property/atom ";
-  auto newcmd1 = fmt::format(fmtstr,id_fix_species,group->names[igroup]);
-  auto newcmd2 = fmt::format(fmtstr,id_fix_species_old,group->names[igroup]);
+  std::string newcmd1 = id_fix_species;
+  newcmd1 += " ";
+  newcmd1 += group->names[igroup];
+  newcmd1 += " property/atom ";
+
+  std::string newcmd2 = id_fix_species_old;
+  newcmd2 += " ";
+  newcmd2 += group->names[igroup];
+  newcmd2 += " property/atom ";
+
  for (int ii=0; ii<nspecies; ii++) {
    newcmd1 += fmt::format(" d_{}", tmpspecies[ii]);
    newcmd2 += fmt::format(" d_{}Old", tmpspecies[ii]);
--- a/src/DRUDE/compute_temp_drude.cpp
+++ b/src/DRUDE/compute_temp_drude.cpp
@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
@ -29,10 +28,9 @@ using namespace LAMMPS_NS;

 /* ---------------------------------------------------------------------- */

-ComputeTempDrude::ComputeTempDrude(LAMMPS *lmp, int narg, char **arg) :
-  Compute(lmp, narg, arg)
+ComputeTempDrude::ComputeTempDrude(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg)
 {
-  if (narg != 3) error->all(FLERR,"Illegal compute temp command");
+  if (narg != 3) error->all(FLERR, "Incorrect number of arguments for compute temp/drude");

  vector_flag = 1;
  scalar_flag = 1;
@ -65,11 +63,13 @@ void ComputeTempDrude::init()
 {
  // Fix drude already checks that there is only one fix drude instance
  auto &fixes = modify->get_fix_by_style("^drude$");
-  if (fixes.size() == 0) error->all(FLERR, "compute temp/drude requires fix drude");
+  if (fixes.size() == 0)
+    error->all(FLERR, Error::NOLASTLINE, "compute temp/drude requires fix drude");
  fix_drude = dynamic_cast<FixDrude *>(fixes[0]);

  if (!comm->ghost_velocity)
-    error->all(FLERR,"compute temp/drude requires ghost velocities. Use comm_modify vel yes");
+    error->all(FLERR, Error::NOLASTLINE,
+               "compute temp/drude requires ghost velocities. Use comm_modify vel yes");
 }

 /* ---------------------------------------------------------------------- */
@ -113,7 +113,7 @@ void ComputeTempDrude::dof_compute()
 int ComputeTempDrude::modify_param(int narg, char **arg)
 {
  if (strcmp(arg[0], "temp") == 0) {
-    if (narg < 2) error->all(FLERR,"Illegal fix_modify command");
+    if (narg < 2) utils::missing_cmd_args(FLERR, "compute_modify temp", error);
    delete[] id_temp;
    id_temp = utils::strdup(arg[1]);

@ -149,20 +149,32 @@ void ComputeTempDrude::compute_vector()
  double mcore, mdrude;
  double ecore, edrude;
  double *vcore, *vdrude;
-    double kineng_core_loc = 0., kineng_drude_loc = 0.;
+  double kineng_core_loc = 0.0, kineng_drude_loc = 0.0;
  for (int i = 0; i < nlocal; i++) {
+    vdrude = nullptr;
+    vcore = nullptr;
    if (groupbit & mask[i] && drudetype[type[i]] != DRUDE_TYPE) {
      if (drudetype[type[i]] == NOPOL_TYPE) {
-                ecore = 0.;
+        ecore = 0.0;
        vcore = v[i];
        if (temperature) temperature->remove_bias(i, vcore);
        for (int k = 0; k < dim; k++) ecore += vcore[k] * vcore[k];
        if (temperature) temperature->restore_bias(i, vcore);
-                if (rmass) mcore = rmass[i];
-                else mcore = mass[type[i]];
+        if (rmass)
+          mcore = rmass[i];
+        else
+          mcore = mass[type[i]];
        kineng_core_loc += mcore * ecore;
      } else {    // CORE_TYPE
        int j = atom->map(drudeid[i]);
+        if (j < 0) {
+          if (drudeid[i] == 0) {
+            error->one(FLERR, "Drude atom for core atom ID {} is not defined", atom->tag[i]);
+          } else {
+            error->one(FLERR, "Drude atom ID {} for core atom ID {} is out of range", drudeid[i],
+                       atom->tag[i]);
+          }
+        }
        if (rmass) {
          mcore = rmass[i];
          mdrude = rmass[j];
@ -170,9 +182,9 @@ void ComputeTempDrude::compute_vector()
          mcore = mass[type[i]];
          mdrude = mass[type[j]];
        }
-                double mtot_inv = 1. / (mcore + mdrude);
-                ecore = 0.;
-                edrude = 0.;
+        double mtot_inv = 1.0 / (mcore + mdrude);
+        ecore = 0.0;
+        edrude = 0.0;
        vcore = v[i];
        vdrude = v[j];
        if (temperature) {
@ -208,9 +220,9 @@ void ComputeTempDrude::compute_vector()
  vector[5] = kineng_drude;
 }

-double ComputeTempDrude::compute_scalar() {
+double ComputeTempDrude::compute_scalar()
+{
  compute_vector();
  scalar = vector[0];
  return scalar;
 }
-
--- a/src/DRUDE/compute_temp_drude.h
+++ b/src/DRUDE/compute_temp_drude.h
@ -32,7 +32,7 @@ class ComputeTempDrude : public Compute {
  void setup() override;
  void compute_vector() override;
  double compute_scalar() override;
-  int modify_param(int, char **);
+  int modify_param(int, char **) override;

 private:
  class FixDrude *fix_drude;
--- a/src/DRUDE/fix_drude.cpp
+++ b/src/DRUDE/fix_drude.cpp
@ -33,7 +33,9 @@ using namespace FixConst;
 FixDrude::FixDrude(LAMMPS *lmp, int narg, char **arg) :
  Fix(lmp, narg, arg)
 {
-  if (narg != 3 + atom->ntypes) error->all(FLERR,"Illegal fix drude command");
+  if (narg != 3 + atom->ntypes)
+    error->all(FLERR,"Incorrect number of arguments ({} instead of {}) for fix drude command",
+               narg, 3 + atom->ntypes);

  comm_border = 1; // drudeid
  special_alter_flag = 1;
@ -49,7 +51,7 @@ FixDrude::FixDrude(LAMMPS *lmp, int narg, char **arg) :
      else if (arg[i][0] == 'd' || arg[i][0] == 'D' || arg[i][0] == '2')
          drudetype[i-2] = DRUDE_TYPE;
      else
-          error->all(FLERR, "Illegal fix drude command");
+        error->all(FLERR, i, "Unknown drude type {} for atom type {}", arg[i], i-2);
  }

  drudeid = nullptr;
@ -82,7 +84,8 @@ FixDrude::~FixDrude()

 void FixDrude::init()
 {
-  if (modify->get_fix_by_style("^drude$").size() > 1) error->all(FLERR,"More than one fix drude");
+  if (modify->get_fix_by_style("^drude$").size() > 1)
+    error->all(FLERR, Error::NOLASTLINE, "More than one fix drude");

  if (!rebuildflag) rebuild_special();
 }
@ -106,8 +109,7 @@ void FixDrude::build_drudeid() {
  std::vector<tagint> core_drude_vec;
  partner_set = new std::set<tagint>[nlocal]; // Temporary sets of bond partner tags

-  if (atom->molecular == Atom::MOLECULAR)
-  {
+  if (atom->molecular == Atom::MOLECULAR) {
    // Build list of my atoms' bond partners
    for (int i=0; i<nlocal; i++) {
      if (drudetype[type[i]] == NOPOL_TYPE) continue;
@ -117,9 +119,7 @@ void FixDrude::build_drudeid() {
        core_drude_vec.push_back(atom->bond_atom[i][k]);
      }
    }
-  }
-  else
-  {
+  } else {
    // Template case
    class Molecule **atommols;
    atommols = atom->avec->onemols;
@ -157,10 +157,17 @@ void FixDrude::build_drudeid() {
  // At this point each of my Drudes knows its core.
  // Send my list of Drudes to other procs and myself
  // so that each core finds its Drude.
-  comm->ring(drude_vec.size(), sizeof(tagint),
-             (char *) drude_vec.data(),
+  comm->ring(drude_vec.size(), sizeof(tagint), (char *) drude_vec.data(),
             3, ring_search_drudeid, nullptr, (void *)this, 1);
  delete[] partner_set;
+
+  // Check if all cores have a drude particle attached
+  for (int i=0; i<nlocal; i++) {
+    if (drudetype[type[i]] == CORE_TYPE) {
+      if (drudeid[i] == 0)
+        error->one(FLERR, Error::NOLASTLINE, "Core atom ID {} has no drude atom", atom->tag[i]);
+    }
+  }
 }

 /* ----------------------------------------------------------------------
@ -347,7 +354,8 @@ void FixDrude::rebuild_special() {
    utils::logmesg(lmp, "New max number of 1-2 to 1-4 neighbors: {} (+{})\n", nspecmax, nspecmax - nspecmax_old);

  if (atom->maxspecial < nspecmax)
-    error->all(FLERR, "Not enough space in special: extra/special/per/atom should be at least {}", nspecmax - nspecmax_old);
+    error->all(FLERR, Error::NOLASTLINE, "Not enough space for special neighbors list: "
+               "use extra/special/per/atom with at least a value of {}", nspecmax - nspecmax_old);

  // Build list of cores' special lists to communicate to ghost drude particles
  for (int i=0; i<nlocal; i++) {
@ -512,7 +520,8 @@ void FixDrude::ring_copy_drude(int size, char *cbuf, void *ptr) {
 void FixDrude::set_arrays(int i) {
    if (drudetype[atom->type[i]] != NOPOL_TYPE) {
        if (atom->nspecial[i] == nullptr)
-          error->all(FLERR, "Polarizable atoms cannot be inserted with special lists info from the molecule template");
+          error->all(FLERR, Error::NOLASTLINE, "Polarizable atoms cannot be inserted "
+                     "with special lists info from the molecule template");
        drudeid[i] = atom->special[i][0]; // Drude partner should be at first place in the special list
    } else {
        drudeid[i] = 0;
--- a/src/ELECTRODE/fix_electrode_conp.cpp
+++ b/src/ELECTRODE/fix_electrode_conp.cpp
@ -1363,10 +1363,10 @@ int FixElectrodeConp::setmask()
 void FixElectrodeConp::write_to_file(FILE *file, const std::vector<tagint> &tags,
                                     const std::vector<std::vector<double>> &mat)
 {
-  for (const auto &t : tags) fmt::print(file, "{:20}", t);
+  for (const auto &t : tags) utils::print(file, "{:20}", t);
  fputs("\n", file);
  for (const auto &vec : mat) {
-    for (const auto &x : vec) fmt::print(file, "{:20.11e}", x);
+    for (const auto &x : vec) utils::print(file, "{:20.11e}", x);
    fputs("\n", file);
  }
 }
--- a/src/EXTRA-COMMAND/group2ndx.cpp
+++ b/src/EXTRA-COMMAND/group2ndx.cpp
@ -93,7 +93,7 @@ void Group2Ndx::write_group(FILE *fp, int gid)
    if (gid == 0) {
      fputs("[ System ]\n", fp);
    } else {
-      fmt::print(fp, "[ {} ]\n", group->names[gid]);
+      utils::print(fp, "[ {} ]\n", group->names[gid]);
    }
    width = log10((double) atom->natoms) + 2;
    cols = 80 / width;
@ -142,7 +142,7 @@ void Group2Ndx::write_group(FILE *fp, int gid)
  if (fp) {
    int i, j;
    for (i = 0, j = 0; i < gcount; ++i) {
-      fmt::print(fp, "{:>{}}", recvlist[i], width);
+      utils::print(fp, "{:>{}}", recvlist[i], width);
      ++j;
      if (j == cols) {
        fputs("\n", fp);
--- a/src/EXTRA-COMPUTE/compute_adf.cpp
+++ b/src/EXTRA-COMPUTE/compute_adf.cpp
@ -39,8 +39,6 @@ using MathConst::RAD2DEG;

 enum { DEGREE, RADIAN, COSINE };

-static constexpr double BIG = 1.0e20;
-
 /* ----------------------------------------------------------------------
   compute angular distribution functions for I, J, K atoms
 ---------------------------------------------------------------------- */
--- a/src/EXTRA-COMPUTE/compute_stress_cartesian.cpp
+++ b/src/EXTRA-COMPUTE/compute_stress_cartesian.cpp
@ -284,9 +284,9 @@ void ComputeStressCartesian::compute_array()
  // calculate number density and kinetic contribution to pressure
  if (compute_ke) {
    for (int i = 0; i < nlocal; i++) {
-      int bin1 = (int) ((x[i][dir1] - boxlo[dir1]) / bin_width1) % nbins1;
+      int bin1 = (int) floor((x[i][dir1] - boxlo[dir1]) / bin_width1) % nbins1;
      int bin2 = 0;
-      if (dims == 2) bin2 = (int) ((x[i][dir2] - boxlo[dir2]) / bin_width2) % nbins2;
+      if (dims == 2) bin2 = (int) floor((x[i][dir2] - boxlo[dir2]) / bin_width2) % nbins2;

      // Apply periodic boundary conditions and avoid out of range access
      if (domain->periodicity[dir1] == 1) {
@ -453,27 +453,6 @@ void ComputeStressCartesian::compute_pressure(double fpair, double xi, double yi
    int bin1 = next_bin1;
    int bin2 = next_bin2;

-    double l1;
-    if (rij1 > 0)
-      l1 = ((bin1 + 1) * bin_width1 - xi) / rij1;
-    else
-      l1 = (bin1 * bin_width1 - xi) / rij1;
-
-    double l2;
-    if (rij2 > 0)
-      l2 = ((bin2 + 1) * bin_width2 - yi) / rij2;
-    else
-      l2 = (bin2 * bin_width2 - yi) / rij2;
-
-    if ((l1 < l2 || l2 < lb + SMALL) && l1 <= 1.0 && l1 > lb) {
-      lb = l1;
-      next_bin1 = bin1 + (int) (rij1 / fabs(rij1));
-    } else if (l2 <= 1.0 && l2 > lb) {
-      lb = l2;
-      next_bin2 = bin2 + (int) (rij2 / fabs(rij2));
-    } else
-      lb = 1.0;
-
    // Periodic boundary conditions
    if (domain->periodicity[dir1] == 1) {
      if (bin1 < 0)
@ -495,6 +474,33 @@ void ComputeStressCartesian::compute_pressure(double fpair, double xi, double yi
    else if (bin2 >= nbins2)
      bin2 = nbins2 - 1;

+    double l1;
+    double tmp1[3] = {0.0, 0.0, 0.0};
+    if (rij1 > 0)
+      tmp1[dir1] = (bin1 + 1) * bin_width1 - xi;
+    else
+      tmp1[dir1] = bin1 * bin_width1 - xi;
+    domain->minimum_image(tmp1[0],tmp1[1],tmp1[2]);
+    l1 = tmp1[dir1] / rij1;
+
+    double l2;
+    double tmp2[3] = {0.0, 0.0, 0.0};
+    if (rij2 > 0)
+      tmp2[dir2] = (bin2 + 1) * bin_width2 - yi;
+    else
+      tmp2[dir2] = bin2 * bin_width2 - yi;
+    domain->minimum_image(tmp2[0],tmp2[1],tmp2[2]);
+    l2 = tmp2[dir2] / rij2;
+
+    if ((dims == 1 || l1 < l2 || l2 < lb + SMALL) && l1 <= 1.0 && l1 > lb) {
+      lb = l1;
+      next_bin1 = bin1 + (int) (rij1 / fabs(rij1));
+    } else if (dims == 2 && l2 <= 1.0 && l2 > lb) {
+      lb = l2;
+      next_bin2 = bin2 + (int) (rij2 / fabs(rij2));
+    } else
+      lb = 1.0;
+
    if (bin1 + bin2 * nbins1 > nbins1 * nbins2) error->all(FLERR, "Bin outside: lb={:.16g}", lb);

    tpcxx[bin1 + bin2 * nbins1] += (fpair * delx * delx * (lb - la));
--- a/src/EXTRA-DUMP/dump_yaml.cpp
+++ b/src/EXTRA-DUMP/dump_yaml.cpp
@ -94,31 +94,31 @@ void DumpYAML::write_header(bigint ndump)

  if (comm->me == 0) {
    const std::string boundary(boundstr);
-    fmt::print(fp, "---\ncreator: LAMMPS\ntimestep: {}\n", update->ntimestep);
-    if (unit_flag) fmt::print(fp, "units: {}\n", update->unit_style);
-    if (time_flag) fmt::print(fp, "time: {:.16g}\n", compute_time());
+    utils::print(fp, "---\ncreator: LAMMPS\ntimestep: {}\n", update->ntimestep);
+    if (unit_flag) utils::print(fp, "units: {}\n", update->unit_style);
+    if (time_flag) utils::print(fp, "time: {:.16g}\n", compute_time());

-    fmt::print(fp, "natoms: {}\n", ndump);
+    utils::print(fp, "natoms: {}\n", ndump);
    fputs("boundary: [ ", fp);
    for (const auto &bflag : boundary) {
      if (bflag == ' ') continue;
-      fmt::print(fp, "{}, ", bflag);
+      utils::print(fp, "{}, ", bflag);
    }
    fputs("]\n", fp);

-    if (thermo) fmt::print(fp, thermo_data);
+    if (thermo) utils::print(fp, thermo_data);

-    fmt::print(fp, "box:\n  - [ {}, {} ]\n", boxxlo, boxxhi);
-    fmt::print(fp, "  - [ {}, {} ]\n", boxylo, boxyhi);
-    fmt::print(fp, "  - [ {}, {} ]\n", boxzlo, boxzhi);
-    if (domain->triclinic) fmt::print(fp, "  - [ {}, {}, {} ]\n", boxxy, boxxz, boxyz);
+    utils::print(fp, "box:\n  - [ {}, {} ]\n", boxxlo, boxxhi);
+    utils::print(fp, "  - [ {}, {} ]\n", boxylo, boxyhi);
+    utils::print(fp, "  - [ {}, {} ]\n", boxzlo, boxzhi);
+    if (domain->triclinic) utils::print(fp, "  - [ {}, {}, {} ]\n", boxxy, boxxz, boxyz);

-    fmt::print(fp, "keywords: [ ");
+    utils::print(fp, "keywords: [ ");
    for (const auto &item : utils::split_words(columns)) {
      if (item.find_first_of(special_chars) == std::string::npos)
-        fmt::print(fp, "{}, ", item);
+        utils::print(fp, "{}, ", item);
      else
-        fmt::print(fp, "'{}', ", item);
+        utils::print(fp, "'{}', ", item);
    }
    fputs(" ]\ndata:\n", fp);
  } else    // reset so that the remainder of the output is not multi-proc
--- a/src/EXTRA-FIX/fix_ave_correlate_long.cpp
+++ b/src/EXTRA-FIX/fix_ave_correlate_long.cpp
@ -489,7 +489,7 @@ void FixAveCorrelateLong::end_of_step()
  if (fp && comm->me == 0) {
    clearerr(fp);
    if (overwrite) (void) platform::fseek(fp,filepos);
-    fmt::print(fp,"# Timestep: {}\n", ntimestep);
+    utils::print(fp,"# Timestep: {}\n", ntimestep);
    for (unsigned int i=0; i < npcorr; ++i) {
      fprintf(fp, "%lg ", t[i]*update->dt*nevery);
      for (int j=0; j < npair; ++j) {
--- a/src/EXTRA-FIX/fix_electron_stopping_fit.cpp
+++ b/src/EXTRA-FIX/fix_electron_stopping_fit.cpp
@ -68,7 +68,8 @@ FixElectronStoppingFit::FixElectronStoppingFit(LAMMPS *lmp, int narg, char **arg
     error->all(FLERR,"Incorrect number of fix electron/stopping/fit arguments");
  }

-  scalar_flag = 1;
+  scalar_flag = 1; // intensive total energy loss since start of run
+  extscalar = 0;
  global_freq = 1;

  energy_coh_in = new double[atom->ntypes+1];
--- a/src/EXTRA-FIX/fix_tmd.cpp
+++ b/src/EXTRA-FIX/fix_tmd.cpp
@ -270,7 +270,7 @@ void FixTMD::initial_integrate(int /*vflag*/)
    work_lambda += lambda*(rho_target - rho_old);
    if (!(update->ntimestep % nfileevery) &&
        (previous_stat != update->ntimestep)) {
-      fmt::print(fp, "{} {} {} {} {} {} {} {}\n", update->ntimestep,rho_target,rho_old,
+      utils::print(fp, "{} {} {} {} {} {} {} {}\n", update->ntimestep,rho_target,rho_old,
                 gamma_back,gamma_forward,lambda,work_lambda,work_analytical);
      fflush(fp);
      previous_stat = update->ntimestep;
--- a/src/EXTRA-FIX/fix_ttm.cpp
+++ b/src/EXTRA-FIX/fix_ttm.cpp
@ -523,7 +523,7 @@ void FixTTM::write_electron_temperatures(const std::string &filename)
  FILE *fp = fopen(filename.c_str(),"w");
  if (!fp) error->one(FLERR,"Fix ttm could not open output file {}: {}",
                      filename,utils::getsyserror());
-  fmt::print(fp,"# DATE: {} UNITS: {} COMMENT: Electron temperature on "
+  utils::print(fp,"# DATE: {} UNITS: {} COMMENT: Electron temperature on "
             "{}x{}x{} grid at step {} - created by fix {}\n", utils::current_date(),
             update->unit_style, nxgrid, nygrid, nzgrid, update->ntimestep, style);

--- a/src/EXTRA-FIX/fix_ttm_grid.cpp
+++ b/src/EXTRA-FIX/fix_ttm_grid.cpp
@ -411,7 +411,7 @@ void FixTTMGrid::write_restart_file(const char *file)
    if (fpout == nullptr)
      error->one(FLERR,"Cannot open fix ttm/grid restart file {}: {}",outfile,utils::getsyserror());

-    fmt::print(fpout,"# DATE: {} UNITS: {} COMMENT: "
+    utils::print(fpout,"# DATE: {} UNITS: {} COMMENT: "
               "Electron temperature on {}x{}x{} grid at step {} - "
               "created by fix {}\n",
               utils::current_date(),update->unit_style,
--- a/src/EXTRA-FIX/fix_ttm_mod.cpp
+++ b/src/EXTRA-FIX/fix_ttm_mod.cpp
@ -628,7 +628,7 @@ void FixTTMMod::write_electron_temperatures(const std::string &filename)
  FILE *fp = fopen(filename.c_str(),"w");
  if (!fp) error->one(FLERR,"Fix ttm/mod could not open output file {}: {}",
                      filename, utils::getsyserror());
-  fmt::print(fp,"# DATE: {} UNITS: {} COMMENT: Electron temperature "
+  utils::print(fp,"# DATE: {} UNITS: {} COMMENT: Electron temperature "
             "{}x{}x{} grid at step {}. Created by fix {}\n", utils::current_date(),
             update->unit_style, nxgrid, nygrid, nzgrid, update->ntimestep, style);

--- a/src/EXTRA-MOLECULE/pair_hbond_dreiding_lj_angleoffset.cpp
+++ b/src/EXTRA-MOLECULE/pair_hbond_dreiding_lj_angleoffset.cpp
@ -0,0 +1,131 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Tod A Pascal (Caltech), Don Xu/EiPi Fun
+------------------------------------------------------------------------- */
+
+#include "pair_hbond_dreiding_lj_angleoffset.h"
+
+#include "atom.h"
+#include "atom_vec.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "molecule.h"
+#include "neigh_list.h"
+#include "neighbor.h"
+
+#include <cmath>
+#include <cstring>
+
+static constexpr double SMALL = 0.001;
+static constexpr int CHUNK = 8;
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+// using namespace MathSpecial;
+
+/* ---------------------------------------------------------------------- */
+
+PairHbondDreidingLJAngleoffset::PairHbondDreidingLJAngleoffset(LAMMPS *lmp)
+    : PairHbondDreidingLJ(lmp) {
+
+  angle_offset_flag = 1;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+void PairHbondDreidingLJAngleoffset::coeff(int narg, char **arg)
+{
+  if (narg < 6 || narg > 11)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+  if (!allocated) allocate();
+
+  int ilo,ihi,jlo,jhi,klo,khi;
+  utils::bounds(FLERR, arg[0], 1, atom->ntypes, ilo, ihi, error);
+  utils::bounds(FLERR, arg[1], 1, atom->ntypes, jlo, jhi, error);
+  utils::bounds_typelabel(FLERR, arg[2], 1, atom->ntypes, klo, khi, lmp, Atom::ATOM);
+
+  int donor_flag;
+  if (strcmp(arg[3],"i") == 0) donor_flag = 0;
+  else if (strcmp(arg[3],"j") == 0) donor_flag = 1;
+  else error->all(FLERR,"Incorrect args for pair coefficients");
+
+  double epsilon_one = utils::numeric(FLERR, arg[4], false, lmp);
+  double sigma_one = utils::numeric(FLERR, arg[5], false, lmp);
+
+  int ap_one = ap_global;
+  if (narg > 6) ap_one = utils::inumeric(FLERR, arg[6], false, lmp);
+  double cut_inner_one = cut_inner_global;
+  double cut_outer_one = cut_outer_global;
+  if (narg > 8) {
+    cut_inner_one = utils::numeric(FLERR, arg[7], false, lmp);
+    cut_outer_one = utils::numeric(FLERR, arg[8], false, lmp);
+  }
+  if (cut_inner_one>cut_outer_one)
+    error->all(FLERR,"Pair inner cutoff >= Pair outer cutoff");
+  double cut_angle_one = cut_angle_global;
+  if (narg > 9) cut_angle_one = utils::numeric(FLERR, arg[9], false, lmp) * MY_PI/180.0;
+  double angle_offset_one = angle_offset_global;
+  if (narg == 11) angle_offset_one = (180.0 - utils::numeric(FLERR, arg[10], false, lmp)) * MY_PI/180.0;
+  if (angle_offset_one < 0.0 || angle_offset_one > 90.0 * MY_PI/180.0)
+    error->all(FLERR,"Illegal angle offset");
+
+  // grow params array if necessary
+
+  if (nparams == maxparam) {
+    maxparam += CHUNK;
+    params = (Param *) memory->srealloc(params, maxparam*sizeof(Param),
+                                        "pair:params");
+
+    // make certain all additional allocated storage is initialized
+    // to avoid false positives when checking with valgrind
+
+    memset(params + nparams, 0, CHUNK*sizeof(Param));
+  }
+
+  params[nparams].epsilon = epsilon_one;
+  params[nparams].sigma = sigma_one;
+  params[nparams].ap = ap_one;
+  params[nparams].cut_inner = cut_inner_one;
+  params[nparams].cut_outer = cut_outer_one;
+  params[nparams].cut_innersq = cut_inner_one*cut_inner_one;
+  params[nparams].cut_outersq = cut_outer_one*cut_outer_one;
+  params[nparams].cut_angle = cut_angle_one;
+  params[nparams].angle_offset = angle_offset_one;
+  params[nparams].denom_vdw =
+    (params[nparams].cut_outersq-params[nparams].cut_innersq) *
+    (params[nparams].cut_outersq-params[nparams].cut_innersq) *
+    (params[nparams].cut_outersq-params[nparams].cut_innersq);
+
+  // flag type2param with either i,j = D,A or j,i = D,A
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++)
+    for (int j = MAX(jlo,i); j <= jhi; j++)
+      for (int k = klo; k <= khi; k++) {
+        if (donor_flag == 0) type2param[i][j][k] = nparams;
+        else type2param[j][i][k] = nparams;
+        count++;
+      }
+  nparams++;
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
--- a/src/EXTRA-MOLECULE/pair_hbond_dreiding_lj_angleoffset.h
+++ b/src/EXTRA-MOLECULE/pair_hbond_dreiding_lj_angleoffset.h
@ -0,0 +1,38 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(hbond/dreiding/lj/angleoffset,PairHbondDreidingLJAngleoffset);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_HBOND_DREIDING_LJ_ANGLEOFFSET_H
+#define LMP_PAIR_HBOND_DREIDING_LJ_ANGLEOFFSET_H
+
+#include "pair_hbond_dreiding_lj.h"
+
+namespace LAMMPS_NS {
+
+class PairHbondDreidingLJAngleoffset : public PairHbondDreidingLJ {
+
+ public:
+  PairHbondDreidingLJAngleoffset(class LAMMPS *);
+  void coeff(int, char **) override;
+
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
--- a/src/EXTRA-MOLECULE/pair_hbond_dreiding_morse_angleoffset.cpp
+++ b/src/EXTRA-MOLECULE/pair_hbond_dreiding_morse_angleoffset.cpp
@ -0,0 +1,129 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+   ------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Tod A Pascal (Caltech), Don Xu/EiPi Fun
+------------------------------------------------------------------------- */
+
+#include "pair_hbond_dreiding_morse_angleoffset.h"
+
+#include "atom.h"
+#include "atom_vec.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "molecule.h"
+#include "neigh_list.h"
+#include "neighbor.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+static constexpr int CHUNK = 8;
+
+/* ---------------------------------------------------------------------- */
+
+PairHbondDreidingMorseAngleoffset::PairHbondDreidingMorseAngleoffset(LAMMPS *lmp) :
+    PairHbondDreidingMorse(lmp)
+{
+  // switch on the angle offset flag (declared and consumed outside this file)
+  angle_offset_flag = 1;
+}
+
+/* ----------------------------------------------------------------------
+ *    set coeffs for one or more type pairs
+ * ---------------------------------------------------------------------- */
+
+void PairHbondDreidingMorseAngleoffset::coeff(int narg, char **arg)
+{
+  // 7 mandatory arguments, up to 12 in total
+  if ((narg < 7) || (narg > 12)) error->all(FLERR,"Incorrect args for pair coefficients");
+  if (!allocated) allocate();
+
+  // type ranges given by the first three arguments
+
+  int ilo,ihi,jlo,jhi,klo,khi;
+  utils::bounds(FLERR, arg[0], 1, atom->ntypes, ilo, ihi, error);
+  utils::bounds(FLERR, arg[1], 1, atom->ntypes, jlo, jhi, error);
+  utils::bounds_typelabel(FLERR, arg[2], 1, atom->ntypes, klo, khi, lmp, Atom::ATOM);
+
+  // arg[3] must be "i" or "j" and selects which type of the pair is the donor
+
+  const bool donor_is_i = (strcmp(arg[3],"i") == 0);
+  if (!donor_is_i && (strcmp(arg[3],"j") != 0))
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // mandatory numeric parameters
+
+  const double d0_one = utils::numeric(FLERR, arg[4], false, lmp);
+  const double alpha_one = utils::numeric(FLERR, arg[5], false, lmp);
+  const double r0_one = utils::numeric(FLERR, arg[6], false, lmp);
+
+  // optional parameters start out with the global settings
+
+  int ap_one = ap_global;
+  double cut_inner_one = cut_inner_global;
+  double cut_outer_one = cut_outer_global;
+  double cut_angle_one = cut_angle_global;
+  double angle_offset_one = angle_offset_global;
+
+  if (narg > 7) ap_one = utils::inumeric(FLERR, arg[7], false, lmp);
+
+  // the two cutoffs are only read when both are present
+  if (narg > 9) {
+    cut_inner_one = utils::numeric(FLERR, arg[8], false, lmp);
+    cut_outer_one = utils::numeric(FLERR, arg[9], false, lmp);
+  }
+  if (cut_inner_one > cut_outer_one) error->all(FLERR,"Pair inner cutoff >= Pair outer cutoff");
+
+  // angles are given in degrees and stored in radians
+  if (narg > 10) cut_angle_one = utils::numeric(FLERR, arg[10], false, lmp) * MY_PI/180.0;
+
+  // the stored offset is (180 - input) converted to radians and must lie in [0, 90] degrees
+  if (narg == 12)
+    angle_offset_one = (180.0 - utils::numeric(FLERR, arg[11], false, lmp)) * MY_PI/180.0;
+  if ((angle_offset_one < 0.0) || (angle_offset_one > 90.0 * MY_PI/180.0))
+    error->all(FLERR,"Illegal angle offset {}", angle_offset_one);
+
+  // grow params array by one chunk when it is full
+
+  if (nparams == maxparam) {
+    maxparam += CHUNK;
+    params = (Param *) memory->srealloc(params, maxparam*sizeof(Param),"pair:params");
+
+    // zero all additional allocated storage so that it is initialized;
+    // this avoids false positives when checking with valgrind
+
+    memset(params + nparams, 0, CHUNK*sizeof(Param));
+  }
+
+  // fill in the new parameter set
+
+  Param &entry = params[nparams];
+  entry.d0 = d0_one;
+  entry.alpha = alpha_one;
+  entry.r0 = r0_one;
+  entry.ap = ap_one;
+  entry.cut_inner = cut_inner_one;
+  entry.cut_outer = cut_outer_one;
+  entry.cut_innersq = cut_inner_one*cut_inner_one;
+  entry.cut_outersq = cut_outer_one*cut_outer_one;
+  entry.cut_angle = cut_angle_one;
+  entry.angle_offset = angle_offset_one;
+  entry.denom_vdw = (entry.cut_outersq-entry.cut_innersq) *
+    (entry.cut_outersq-entry.cut_innersq) *
+    (entry.cut_outersq-entry.cut_innersq);
+
+  // flag type2param with either i,j = D,A or j,i = D,A
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      for (int k = klo; k <= khi; k++) {
+        if (donor_is_i) type2param[i][j][k] = nparams;
+        else type2param[j][i][k] = nparams;
+        count++;
+      }
+    }
+  }
+  nparams++;
+
+  // no type triple was covered by the given ranges
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
--- a/src/EXTRA-MOLECULE/pair_hbond_dreiding_morse_angleoffset.h
+++ b/src/EXTRA-MOLECULE/pair_hbond_dreiding_morse_angleoffset.h
@ -0,0 +1,38 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(hbond/dreiding/morse/angleoffset,PairHbondDreidingMorseAngleoffset);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_HBOND_DREIDING_MORSE_ANGLEOFFSET_H
+#define LMP_PAIR_HBOND_DREIDING_MORSE_ANGLEOFFSET_H
+
+#include "pair_hbond_dreiding_morse.h"
+
+namespace LAMMPS_NS {
+
+// Pair style class derived from PairHbondDreidingMorse. It declares its own
+// constructor and overrides coeff(); it is registered above under the
+// style name hbond/dreiding/morse/angleoffset.
+class PairHbondDreidingMorseAngleoffset : public PairHbondDreidingMorse {
+
+ public:
+  PairHbondDreidingMorseAngleoffset(class LAMMPS *);
+  // sets the coefficients for one or more type pairs from (argument count, argument list)
+  void coeff(int, char **) override;
+
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
--- a/src/GRANULAR/fix_granular_mdr.cpp
+++ b/src/GRANULAR/fix_granular_mdr.cpp
@ -0,0 +1,702 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors:
+   William Zunker (MIT), Sachith Dunatunga (MIT),
+   Dan Bolintineanu (SNL), Joel Clemmer (SNL)
+----------------------------------------------------------------------- */
+
+#include "fix_granular_mdr.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "fix_wall_gran_region.h"
+#include "fix_neigh_history.h"
+#include "force.h"
+#include "granular_model.h"
+#include "gran_sub_mod_normal.h"
+#include "input.h"
+#include "math_const.h"
+#include "memory.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "pair.h"
+#include "pair_granular.h"
+#include "region.h"
+#include "update.h"
+#include "variable.h"
+
+using namespace LAMMPS_NS;
+using namespace Granular_NS;
+using namespace Granular_MDR_NS;
+using namespace FixConst;
+using MathConst::MY_PI;
+
+static constexpr double EPSILON = 1e-16;
+static constexpr double OVERLAP_LIMIT = 0.75;
+
+enum {COMM_1, COMM_2};
+
+/* ---------------------------------------------------------------------- */
+
+FixGranularMDR::FixGranularMDR(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
+{
+  // ID of the internally created fix; assigned in post_constructor()
+  id_fix = nullptr;
+
+  create_attribute = 1;
+
+  // the first forward communication stage packs five values per atom
+  comm_forward = 5;
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixGranularMDR::~FixGranularMDR()
+{
+  // delete the internally created fix, but only if an ID was set
+  // and the fix list is not already empty
+  const bool remove_fix = (id_fix != nullptr) && (modify->nfix != 0);
+  if (remove_fix) modify->delete_fix(id_fix);
+
+  delete[] id_fix;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixGranularMDR::setmask()
+{
+  // the only bit set in the mask is PRE_FORCE
+  return PRE_FORCE;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixGranularMDR::post_constructor()
+{
+  // create the internal fix property/atom holding the per-atom MDR quantities
+
+  id_fix = utils::strdup("MDR_PARTICLE_HISTORY_VARIABLES");
+  modify->add_fix(fmt::format("{} all property/atom d_Ro d_Vcaps d_Vgeo d_Velas d_eps_bar d_dRnumerator d_dRdenominator d_Acon0 d_Acon1 d_Atot d_Atot_sum d_ddelta_bar d_psi d_history_setup_flag d_sigmaxx d_sigmayy d_sigmazz ghost yes", id_fix));
+
+  // look up the index of a custom per-atom property by name;
+  // the two output arguments of find_custom() are not needed here
+
+  auto lookup = [this](const char *name) {
+    int flag, cols;
+    return atom->find_custom(name, flag, cols);
+  };
+
+  index_Ro = lookup("Ro");
+  index_Vcaps = lookup("Vcaps");
+  index_Vgeo = lookup("Vgeo");
+  index_Velas = lookup("Velas");
+  index_eps_bar = lookup("eps_bar");
+  index_dRnumerator = lookup("dRnumerator");
+  index_dRdenominator = lookup("dRdenominator");
+  index_Acon0 = lookup("Acon0");
+  index_Acon1 = lookup("Acon1");
+  index_Atot = lookup("Atot");
+  index_Atot_sum = lookup("Atot_sum");
+  index_ddelta_bar = lookup("ddelta_bar");
+  index_psi = lookup("psi");
+  index_history_setup_flag = lookup("history_setup_flag");
+  index_sigmaxx = lookup("sigmaxx");
+  index_sigmayy = lookup("sigmayy");
+  index_sigmazz = lookup("sigmazz");
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixGranularMDR::setup_pre_force(int /*vflag*/)
+{
+  // Validates the simulation setup for the MDR model, caches pointers to the
+  // pair style and its neighbor history fix, and runs pre_force() once.
+
+  pair = dynamic_cast<PairGranular *>(force->pair_match("granular", 1));
+  if (pair == nullptr)
+    error->all(FLERR, "Must use pair granular with MDR model");
+
+  if (force->newton)
+    error->all(FLERR, "MDR contact model requires Newton off");
+
+  // Confirm all MDR models are consistent:
+  // exactly one normal model may be defined in the pair style and it must be "mdr"
+
+  class GranSubModNormalMDR *norm_model = nullptr;
+  for (int i = 0; i < pair->nmodels; i++) {
+    class GranularModel *pair_model = pair->models_list[i];
+    if (pair_model->normal_model->name == "mdr") {
+      if (norm_model != nullptr)
+        error->all(FLERR, "Cannot currently define multiple MDR normal models in the pairstyle");
+      norm_model = dynamic_cast<GranSubModNormalMDR *>(pair_model->normal_model);
+    } else {
+      error->all(FLERR, "Cannot combine MDR normal model with a different normal model in the pairstyle");
+    }
+  }
+
+  if (norm_model == nullptr)
+    error->all(FLERR, "Must specify MDR normal model with pair granular");
+  psi_b_coeff = norm_model->psi_b;
+
+  // every fix wall/gran/region must use an MDR normal model
+  // with the same material parameters as the pair style
+
+  fix_wall_list = modify->get_fix_by_style("wall/gran/region");
+  for (auto &ifix : fix_wall_list) {
+    if (!utils::strmatch(ifix->style, "wall/gran/region"))
+      error->all(FLERR, "MDR model currently only supports fix wall/gran/region, not fix wall/gran");
+
+    // stop with an error instead of dereferencing a null pointer if the cast fails
+    auto *wall = dynamic_cast<FixWallGranRegion *>(ifix);
+    if (wall == nullptr)
+      error->all(FLERR, "Could not access fix wall/gran/region instance for the MDR model");
+
+    if (wall->model->normal_model->name != "mdr")
+      error->all(FLERR, "Fix wall/gran/region must use an MDR normal model when using an MDR pair model");
+
+    auto *wall_norm = dynamic_cast<GranSubModNormalMDR *>(wall->model->normal_model);
+    if (wall_norm == nullptr)
+      error->all(FLERR, "Fix wall/gran/region must use an MDR normal model when using an MDR pair model");
+
+    if (norm_model->E != wall_norm->E)
+      error->all(FLERR, "Young's modulus in pair style, {}, does not agree with value {} in fix wall/gran/region",
+        norm_model->E, wall_norm->E);
+    if (norm_model->nu != wall_norm->nu)
+      error->all(FLERR, "Poisson's ratio in pair style, {}, does not agree with value {} in fix wall/gran/region",
+        norm_model->nu, wall_norm->nu);
+    if (norm_model->Y != wall_norm->Y)
+      error->all(FLERR, "Yield stress in pair style, {}, does not agree with value {} in fix wall/gran/region",
+        norm_model->Y, wall_norm->Y);
+    if (norm_model->psi_b != wall_norm->psi_b)
+      error->all(FLERR, "Bulk response trigger in pair style, {}, does not agree with value {} in fix wall/gran/region",
+        norm_model->psi_b, wall_norm->psi_b);
+    if (norm_model->CoR != wall_norm->CoR)
+      error->all(FLERR, "Coefficient of restitution in pair style, {}, does not agree with value {} in fix wall/gran/region",
+        norm_model->CoR, wall_norm->CoR);
+  }
+
+  // the neighbor history fix is dereferenced in calculate_contact_penalty()
+  // and mean_surf_disp(), so it must exist
+
+  fix_history = dynamic_cast<FixNeighHistory *>(modify->get_fix_by_id("NEIGH_HISTORY_GRANULAR"));
+  if (fix_history == nullptr)
+    error->all(FLERR, "Could not find the NEIGH_HISTORY_GRANULAR fix required by the MDR model");
+
+  pre_force(0);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixGranularMDR::pre_force(int)
+{
+  // Per-call update of the per-atom MDR quantities stored in custom dvectors:
+  // initialize atoms whose setup flag is unset, update volumes, areas, psi and
+  // possibly the radius of local atoms, forward results to ghost atoms, reset
+  // the accumulators, and (outside of setup) evaluate contact penalties, mean
+  // surface displacements and wall contacts.
+
+  double *radius = atom->radius;
+  double *Ro = atom->dvector[index_Ro];
+  double *Vgeo = atom->dvector[index_Vgeo];
+  double *Velas = atom->dvector[index_Velas];
+  double *Vcaps = atom->dvector[index_Vcaps];
+  double *eps_bar = atom->dvector[index_eps_bar];
+  double *dRnumerator = atom->dvector[index_dRnumerator];
+  double *dRdenominator = atom->dvector[index_dRdenominator];
+  double *Acon0 = atom->dvector[index_Acon0];
+  double *Acon1 = atom->dvector[index_Acon1];
+  double *Atot = atom->dvector[index_Atot];
+  double *Atot_sum = atom->dvector[index_Atot_sum];
+  double *psi = atom->dvector[index_psi];
+  double *ddelta_bar = atom->dvector[index_ddelta_bar];
+  double *sigmaxx = atom->dvector[index_sigmaxx];
+  double *sigmayy = atom->dvector[index_sigmayy];
+  double *sigmazz = atom->dvector[index_sigmazz];
+  double *history_setup_flag = atom->dvector[index_history_setup_flag];
+
+  int new_atom;
+  int nlocal = atom->nlocal;
+  int ntotal = nlocal + atom->nghost;
+  for (int i = 0; i < ntotal; i++) {
+    // initialize new atoms (local and ghost): the flag is stored as a double,
+    // a value below EPSILON means "not yet set up"; the current radius is
+    // recorded as the reference radius Ro
+    new_atom = 0;
+    if (history_setup_flag[i] < EPSILON) {
+      Ro[i] = radius[i];
+      Acon0[i] = 0.0;
+      Acon1[i] = 0.0;
+      Vcaps[i] = 0.0;
+      eps_bar[i] = 0.0;
+      dRnumerator[i] = 0.0;
+      dRdenominator[i] = 0.0;
+      Atot_sum[i] = 0.0;
+      ddelta_bar[i] = 0.0;
+      sigmaxx[i] = 0.0;
+      sigmayy[i] = 0.0;
+      sigmazz[i] = 0.0;
+      history_setup_flag[i] = 1.0;
+      new_atom = 1;
+    }
+
+    // update apparent radius
+
+    // will forward to ghosts
+    if (i >= nlocal) continue;
+
+    // only update outside of setup (unless a new atom)
+    if (update->setupflag && (!new_atom)) continue;
+
+    // Vo: volume of a sphere with the reference radius Ro
+    // Vgeoi: volume of a sphere with the current radius minus Vcaps
+    const double R = radius[i];
+    const double Vo = 4.0 / 3.0 * MY_PI * pow(Ro[i], 3.0);
+    const double Vgeoi = 4.0 / 3.0 * MY_PI * pow(R, 3.0) - Vcaps[i];
+
+    Vgeo[i] = MIN(Vgeoi, Vo);
+    Velas[i] = Vo * (1.0 + eps_bar[i]);
+    Atot[i] = 4.0 * MY_PI * pow(R, 2.0) + Atot_sum[i];
+    psi[i] = (Atot[i] - Acon1[i]) / Atot[i];
+
+    // the radius only grows (dR is clamped to >= 0), only while psi exceeds
+    // psi_b_coeff, and only if the result stays below 1.5 times Ro
+    // NOTE(review): the denominator of dR is not guarded against zero - confirm
+    if (psi_b_coeff < psi[i]) {
+      const double dR = MAX(dRnumerator[i] / (dRdenominator[i] - 4.0 * MY_PI * pow(R, 2.0)), 0.0);
+      if ((radius[i] + dR) < (1.5 * Ro[i]))
+        radius[i] += dR;
+    }
+    Acon0[i] = Acon1[i];
+  }
+
+  // first stage: forward Vgeo, Velas, Acon0, Atot and psi (5 values per atom)
+  comm_stage = COMM_1;
+  comm->forward_comm(this, 5);
+
+  // rezero temporary variables for all atoms, no need to communicate
+  // (during setup only ddelta_bar is reset)
+  for (int i = 0; i < ntotal; i++) {
+    ddelta_bar[i] = 0.0;
+    if (!update->setupflag) {
+      Vcaps[i] = 0.0;
+      eps_bar[i] = 0.0;
+      dRnumerator[i] = 0.0;
+      dRdenominator[i] = 0.0;
+      Acon1[i] = 0.0;
+      Atot_sum[i] = 0.0;
+      sigmaxx[i] = 0.0;
+      sigmayy[i] = 0.0;
+      sigmazz[i] = 0.0;
+    }
+  }
+  // contact penalties, mean surface displacement and wall updates are skipped during setup
+  if (!update->setupflag) {
+    calculate_contact_penalty();
+    mean_surf_disp();
+    update_fix_gran_wall();
+  }
+
+  // second stage: forward ddelta_bar (1 value per atom)
+  comm_stage = COMM_2;
+  comm->forward_comm(this, 1);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixGranularMDR::pack_forward_comm(int n, int *list, double *buf, int /*pbc_flag*/,int * /*pbc*/)
+{
+  int nbuf = 0;
+
+  // any stage other than COMM_1 sends only ddelta_bar
+  if (comm_stage != COMM_1) {
+    const double *ddelta_bar = atom->dvector[index_ddelta_bar];
+    for (int ii = 0; ii < n; ii++) buf[nbuf++] = ddelta_bar[list[ii]];
+    return nbuf;
+  }
+
+  // stage COMM_1 sends five values per atom in this fixed order
+  const double *Vgeo = atom->dvector[index_Vgeo];
+  const double *Velas = atom->dvector[index_Velas];
+  const double *Acon0 = atom->dvector[index_Acon0];
+  const double *Atot = atom->dvector[index_Atot];
+  const double *psi = atom->dvector[index_psi];
+
+  for (int ii = 0; ii < n; ii++) {
+    const int j = list[ii];
+    buf[nbuf++] = Vgeo[j];
+    buf[nbuf++] = Velas[j];
+    buf[nbuf++] = Acon0[j];
+    buf[nbuf++] = Atot[j];
+    buf[nbuf++] = psi[j];
+  }
+  return nbuf;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixGranularMDR::unpack_forward_comm(int n, int first, double *buf)
+{
+  const int last = first + n;
+  const double *in = buf;
+
+  // any stage other than COMM_1 carries only ddelta_bar
+  if (comm_stage != COMM_1) {
+    double *ddelta_bar = atom->dvector[index_ddelta_bar];
+    for (int i = first; i < last; i++) ddelta_bar[i] = *in++;
+    return;
+  }
+
+  // stage COMM_1 carries five values per atom, in the order used when packing
+  double *Vgeo = atom->dvector[index_Vgeo];
+  double *Velas = atom->dvector[index_Velas];
+  double *Acon0 = atom->dvector[index_Acon0];
+  double *Atot = atom->dvector[index_Atot];
+  double *psi = atom->dvector[index_psi];
+
+  for (int i = first; i < last; i++) {
+    Vgeo[i] = *in++;
+    Velas[i] = *in++;
+    Acon0[i] = *in++;
+    Atot[i] = *in++;
+    psi[i] = *in++;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   initialize setup flag to zero, called when atom is created
+------------------------------------------------------------------------- */
+
+void FixGranularMDR::set_arrays(int i)
+{
+  // an unset flag makes pre_force() initialize the MDR quantities of atom i
+  double *setup_flag = atom->dvector[index_history_setup_flag];
+  setup_flag[i] = 0.0;
+}
+
+/* ----------------------------------------------------------------------
+  Screen for non-physical contacts occurring through obstructing particles.
+  Assign non-zero penalties to these contacts to adjust force evaluation.
+------------------------------------------------------------------------- */
+
+void FixGranularMDR::calculate_contact_penalty()
+{
+  // For every triplet (i, j, k) of mutually overlapping particles, where j and k
+  // are neighbors of i, a penalty is added to the PENALTY history entry of the
+  // pair with the largest center distance. The penalty is a sigmoid of the angle
+  // at the remaining (central) particle between the directions to the other two.
+
+  NeighList *list = pair->list;
+  const int size_history = pair->get_size_history();
+
+  double **x = atom->x;
+  double *radius = atom->radius;
+  const int nlocal = atom->nlocal;
+
+  const int inum = list->inum;
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  double **firsthistory = fix_history->firstvalue;
+
+  // penalty as a function of the cosine of the angle at the central particle;
+  // the cosine is clamped to [-1,1] since round-off in the dot product of two
+  // normalized vectors can push it slightly outside the domain of acos(),
+  // which would result in a NaN penalty (e.g. for nearly collinear particles)
+
+  auto angle_penalty = [](double cosine) {
+    const double alpha = std::acos(MAX(-1.0, MIN(1.0, cosine)));
+    return 1.0 / (1.0 + std::exp(-50.0 * (alpha / MY_PI - 0.5)));
+  };
+
+  // zero existing penalties
+
+  for (int ii = 0; ii < inum; ii++) {
+    const int i = ilist[ii];
+    double *allhistory = firsthistory[i];
+    const int jnum = numneigh[i];
+    for (int jj = 0; jj < jnum; jj++)
+      allhistory[size_history * jj + PENALTY] = 0.0;
+  }
+
+  // contact penalty calculation
+
+  for (int ii = 0; ii < inum; ii++) {
+    const int i = ilist[ii];
+    const double xtmp = x[i][0];
+    const double ytmp = x[i][1];
+    const double ztmp = x[i][2];
+    const double radi = radius[i];
+    double *allhistory = firsthistory[i];
+    int *jlist = firstneigh[i];
+    const int jnum = numneigh[i];
+
+    for (int jj = 0; jj < jnum; jj++) {
+      const int j = jlist[jj] & NEIGHMASK;
+
+      const double radj = radius[j];
+      const double delx_ij = x[j][0] - xtmp;
+      const double dely_ij = x[j][1] - ytmp;
+      const double delz_ij = x[j][2] - ztmp;
+      const double rsq_ij = delx_ij * delx_ij + dely_ij * dely_ij + delz_ij * delz_ij;
+      const double r_ij = sqrt(rsq_ij);
+      const double rinv_ij = 1.0 / r_ij;
+      const double deltan_ij = (radi + radj) - r_ij;
+
+      // i and j do not overlap
+      if (deltan_ij < 0.0) continue;
+
+      for (int kk = jj + 1; kk < jnum; kk++) {
+        const int k = jlist[kk] & NEIGHMASK;
+
+        const double delx_ik = x[k][0] - xtmp;
+        const double dely_ik = x[k][1] - ytmp;
+        const double delz_ik = x[k][2] - ztmp;
+        const double rsq_ik = delx_ik * delx_ik + dely_ik * dely_ik + delz_ik * delz_ik;
+        const double r_ik = sqrt(rsq_ik);
+        const double rinv_ik = 1.0 / r_ik;
+        const double radk = radius[k];
+        const double deltan_ik = (radi + radk) - r_ik;
+
+        // i and k do not overlap
+        if (deltan_ik < 0.0) continue;
+
+        const double delx_jk = x[k][0] - x[j][0];
+        const double dely_jk = x[k][1] - x[j][1];
+        const double delz_jk = x[k][2] - x[j][2];
+        const double rsq_jk = delx_jk * delx_jk + dely_jk * dely_jk + delz_jk * delz_jk;
+        const double r_jk = sqrt(rsq_jk);
+        const double rinv_jk = 1.0 / r_jk;
+        const double deltan_jk = (radj + radk) - r_jk;
+
+        // j and k do not overlap
+        if (deltan_jk < 0.0) continue;
+
+        // penalty entries of the ij and ik contacts in the history of atom i
+        double *pij = &allhistory[size_history * jj + PENALTY];
+        double *pik = &allhistory[size_history * kk + PENALTY];
+
+        // Find pair of atoms with the smallest overlap, atoms a & b, 3rd atom c is central
+        //   if a & b are both local:
+        //     calculate ab penalty and add to the pab[0] history entry
+        //   if a is local & b is ghost or vice versa:
+        //     each processor has a-b in nlist and independently calculates + adds penalty
+        //   if a & b are both ghosts:
+        //     skip calculation since it's performed on other proc
+        // This process requires newton off, or nlist may not include ab, ac, & bc
+
+        const double r_max = MAX(r_ij, MAX(r_ik, r_jk));
+        if (r_ij == r_max) {    // the central particle is k
+          const double enx_ki = -delx_ik * rinv_ik;
+          const double eny_ki = -dely_ik * rinv_ik;
+          const double enz_ki = -delz_ik * rinv_ik;
+          const double enx_kj = -delx_jk * rinv_jk;
+          const double eny_kj = -dely_jk * rinv_jk;
+          const double enz_kj = -delz_jk * rinv_jk;
+          pij[0] += angle_penalty(enx_ki * enx_kj + eny_ki * eny_kj + enz_ki * enz_kj);
+        } else if (r_ik == r_max) {    // the central particle is j
+          const double enx_ji = -delx_ij * rinv_ij;
+          const double eny_ji = -dely_ij * rinv_ij;
+          const double enz_ji = -delz_ij * rinv_ij;
+          const double enx_jk = delx_jk * rinv_jk;
+          const double eny_jk = dely_jk * rinv_jk;
+          const double enz_jk = delz_jk * rinv_jk;
+          pik[0] += angle_penalty(enx_ji * enx_jk + eny_ji * eny_jk + enz_ji * enz_jk);
+        } else {    // the central particle is i
+
+          // skip if j and k are both ghosts
+          if ((j >= nlocal) && (k >= nlocal)) continue;
+
+          const double enx_ij = delx_ij * rinv_ij;
+          const double eny_ij = dely_ij * rinv_ij;
+          const double enz_ij = delz_ij * rinv_ij;
+          const double enx_ik = delx_ik * rinv_ik;
+          const double eny_ik = dely_ik * rinv_ik;
+          const double enz_ik = delz_ik * rinv_ik;
+          const double cosine = enx_ij * enx_ik + eny_ij * eny_ik + enz_ij * enz_ik;
+
+          // don't know who owns the contact, k may be in j's nlist or vice versa
+          // need to search both to find owner
+          double *pjk = nullptr;
+
+          // check if k is in the neighbor list of j
+          if (j < nlocal) {
+            const int *jklist = firstneigh[j];
+            const int jknum = numneigh[j];
+            for (int jk = 0; jk < jknum; jk++) {
+              if (k == (jklist[jk] & NEIGHMASK)) {
+                pjk = &firsthistory[j][size_history * jk + PENALTY];
+                break;
+              }
+            }
+          }
+
+          // check if j is in the neighbor list of k
+          if ((pjk == nullptr) && (k < nlocal)) {
+            const int *kjlist = firstneigh[k];
+            const int kjnum = numneigh[k];
+            for (int kj = 0; kj < kjnum; kj++) {
+              if (j == (kjlist[kj] & NEIGHMASK)) {
+                pjk = &firsthistory[k][size_history * kj + PENALTY];
+                break;
+              }
+            }
+          }
+
+          if (pjk == nullptr)
+            error->one(FLERR, "Contact between a pair of particles was detected by the MDR model, however it is not reflected in the neighbor lists. To solve this issue either build the neighbor lists more frequently or increase their size (e.g. increase the skin distance).");
+
+          pjk[0] += angle_penalty(cosine);
+        }
+      }
+    }
+  }
+}
+
+
+/* ----------------------------------------------------------------------
+   Calculate mean surface displacement increment for each particle
+------------------------------------------------------------------------- */
+
+void FixGranularMDR::mean_surf_disp()
+{
+  // Loops over all pairs in the neighbor list of the pair style and, for pairs in
+  // contact, accumulates a contribution to ddelta_bar of both atoms of the pair.
+  // Pairs not in contact get their touch flag and history cleared.
+
+  NeighList * list = pair->list;
+
+  const int size_history = pair->get_size_history();
+  int i, j, k, ii, jj, inum, jnum, itype, jtype;
+  int *ilist, *jlist, *numneigh, **firstneigh;
+  int *touch, **firsttouch;
+  double *history, *allhistory, **firsthistory;
+
+  bool touchflag = false;
+  class GranularModel* model;
+  class GranularModel** models_list = pair->models_list;
+  int ** types_indices = pair->types_indices;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  double *radius = atom->radius;
+  // NOTE(review): nlocal is not used in this function
+  int nlocal = atom->nlocal;
+
+  double *Acon0 = atom->dvector[index_Acon0];
+  double *ddelta_bar = atom->dvector[index_ddelta_bar];
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+  firsttouch = fix_history->firstflag;
+  firsthistory = fix_history->firstvalue;
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    itype = type[i];
+    touch = firsttouch[i];
+    allhistory = firsthistory[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      // the granular model is selected by the types of the two atoms
+      jtype = type[j];
+      model = models_list[types_indices[itype][jtype]];
+
+      // Reset model and copy initial geometric data
+      model->xi = x[i];
+      model->xj = x[j];
+      model->radi = radius[i];
+      model->radj = radius[j];
+      model->i = i;
+      model->j = j;
+      model->touch = touch[jj];
+      touchflag = model->check_contact();
+
+      // is it necessary to clear the history here???
+      if (!touchflag) {
+        touch[jj] = 0;
+        history = &allhistory[size_history * jj];
+        for (k = 0; k < size_history; k++) history[k] = 0.0;
+        continue;
+      }
+
+      touch[jj] = 1;
+
+      history = &allhistory[size_history * jj];
+      model->history = history;
+
+      // current overlap; radsum and rsq are read from the model after check_contact()
+      const double delta = model->radsum - sqrt(model->rsq);
+
+      double deltamax = history[DELTA_MAX];
+      double deltap0 = history[DELTAP_0];
+      double deltap1 = history[DELTAP_1];
+
+      // use the larger of the stored maximum and the current overlap
+      // (the local value is not written back to the history here)
+      if (delta > deltamax) deltamax = delta;
+
+      double delta0old = history[DELTA_0];
+      double delta1old = history[DELTA_1];
+
+      // i0 is the atom of the pair with the larger tag, i1 the other one;
+      // history entries ending in _0 are used with i0, those ending in _1 with i1
+      int i0;
+      int i1;
+      if (atom->tag[i] > atom->tag[j]) {
+        i0 = i;
+        i1 = j;
+      } else {
+        i0 = j;
+        i1 = i;
+      }
+
+      double R0 = radius[i0];
+      double R1 = radius[i1];
+
+      // split deltamax into per-particle parts: the particle with the smaller
+      // radius is assigned the larger of the two candidate values
+      double delta_geo0;
+      double delta_geo1;
+      double deltaOpt1 = deltamax * (deltamax - 2.0 * R1) / (2.0 * (deltamax - R0 - R1));
+      double deltaOpt2 = deltamax * (deltamax - 2.0 * R0) / (2.0 * (deltamax - R0 - R1));
+      (R0 < R1) ? delta_geo0 = MAX(deltaOpt1, deltaOpt2) : delta_geo0 = MIN(deltaOpt1, deltaOpt2);
+      (R0 < R1) ? delta_geo1 = MIN(deltaOpt1, deltaOpt2) : delta_geo1 = MAX(deltaOpt1, deltaOpt2);
+
+      // cap either part at OVERLAP_LIMIT times the particle radius and
+      // assign the remainder of deltamax to the other particle
+      if (delta_geo0 / R0 > OVERLAP_LIMIT) {
+        delta_geo0 = R0 * OVERLAP_LIMIT;
+        delta_geo1 = deltamax - delta_geo0;
+      } else if (delta_geo1 / R1 > OVERLAP_LIMIT) {
+        delta_geo1 = R1 * OVERLAP_LIMIT;
+        delta_geo0 = deltamax - delta_geo1;
+      }
+
+      double deltap = deltap0 + deltap1;
+
+      // linear interpolation in delta: equals delta_geo at delta = deltamax
+      // and deltap0 (or deltap1) at delta = deltap
+      // NOTE(review): divides by (deltap - deltamax) without a zero check - confirm
+      double delta0 = delta_geo0 + (deltap0 - delta_geo0) / (deltap - deltamax) * (delta - deltamax);
+      double delta1 = delta_geo1 + (deltap1 - delta_geo1) / (deltap - deltamax) * (delta - deltamax);
+
+      // change with respect to the values stored in the history
+      double ddel0 = delta0 - delta0old;
+      double ddel1 = delta1 - delta1old;
+
+      // accumulate the increments weighted by history area entry / Acon0,
+      // skipping atoms with zero Acon0 to avoid dividing by zero
+      if (Acon0[i0] != 0.0) {
+        const double Ac_offset0 = history[AC_0];
+        ddelta_bar[i0] += Ac_offset0 / Acon0[i0] * ddel0;
+      }
+
+      if (Acon0[i1] != 0.0) {
+        const double Ac_offset1 = history[AC_1];
+        ddelta_bar[i1] += Ac_offset1 / Acon0[i1] * ddel1;
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Update wall contacts and ddelta_bar for each fix wall/gran/region instance
+------------------------------------------------------------------------- */
+
+void FixGranularMDR::update_fix_gran_wall()
+{
+  // For every fix in fix_wall_list: determine the wall contacts of each local
+  // atom in the fix group, keep the contact bookkeeping of the wall fix up to
+  // date, and add the wall contributions to ddelta_bar.
+
+  int i, m, nc, iwall;
+
+  double **x = atom->x;
+  double *radius = atom->radius;
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+
+  double *Acon0 = atom->dvector[index_Acon0];
+  double *ddelta_bar = atom->dvector[index_ddelta_bar];
+
+  for (auto &ifix : fix_wall_list) {
+    FixWallGranRegion *fix = dynamic_cast<FixWallGranRegion*>(ifix);
+    GranularModel *model = fix->model;
+    const int size_history = model->size_history;
+    Region *region = fix->region;
+
+    if (region->dynamic_check())
+      region->prematch();
+
+    for (i = 0; i < nlocal; i++) {
+      if (!(mask[i] & groupbit)) continue;
+      if (! region->match(x[i][0], x[i][1], x[i][2])) continue;
+
+      // number of region surface contacts within radius plus pulloff distance
+      nc = region->surface(x[i][0], x[i][1], x[i][2], radius[i] + model->pulloff_distance(radius[i], 0.0));
+
+      if (nc == 0) {
+        fix->ncontact[i] = 0;
+        continue;
+      }
+      if (nc == 1) {
+        fix->c2r[0] = 0;
+        iwall = region->contact[0].iwall;
+        if (fix->ncontact[i] == 0) {
+          // first contact of this atom: register the wall and zero its history
+          fix->ncontact[i] = 1;
+          fix->walls[i][0] = iwall;
+          for (m = 0; m < size_history; m++) fix->history_many[i][0][m] = 0.0;
+        } else if (fix->ncontact[i] > 1 || iwall != fix->walls[i][0])
+          fix->update_contacts(i, nc);
+      } else
+        fix->update_contacts(i, nc);
+
+      // process current contacts, using the named history indices instead of
+      // literal offsets (DELTA_0 is 0 and AC_0 is 18 in HistoryIndex)
+      for (int ic = 0; ic < nc; ic++) {
+        const double wij = 1.0;
+        if (Acon0[i] != 0.0) {
+          const double *wall_history = fix->history_many[i][fix->c2r[ic]];
+          const double delta = radius[i] - region->contact[ic].r;
+          const double delta_offset0 = wall_history[DELTA_0];
+          const double ddelta = delta - delta_offset0;
+          const double Ac_offset0 = wall_history[AC_0];
+          ddelta_bar[i] += wij * Ac_offset0 / Acon0[i] * ddelta;
+        }
+      }
+    }
+  }
+}
--- a/src/GRANULAR/fix_granular_mdr.h
+++ b/src/GRANULAR/fix_granular_mdr.h
@ -0,0 +1,107 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+// clang-format off
+FixStyle(GRANULAR/MDR,FixGranularMDR);
+// clang-format on
+#else
+
+#ifndef LMP_FIX_GRANULAR_MDR_H
+#define LMP_FIX_GRANULAR_MDR_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+namespace Granular_MDR_NS {
+
+  // Slots of the per-contact history array used by the MDR normal model.
+  // Entries named NAME_0 / NAME_1 form a pair that is addressed as
+  // NAME_0 + contactSide, i.e. one slot for each side of the contact.
+  enum HistoryIndex {
+    DELTA_0 = 0,       // apparent overlap
+    DELTA_1,
+    DELTAO_0,          // displacement
+    DELTAO_1,
+    DELTA_MDR_0,       // MDR apparent overlap
+    DELTA_MDR_1,
+    DELTA_BULK_0,      // bulk displacement
+    DELTA_BULK_1,
+    DELTAMAX_MDR_0,    // maximum MDR apparent overlap
+    DELTAMAX_MDR_1,
+    YFLAG_0,           // yield flag
+    YFLAG_1,
+    DELTAY_0,          // yield displacement
+    DELTAY_1,
+    CA_0,              // contact area intercept
+    CA_1,
+    AADH_0,            // adhesive contact radius
+    AADH_1,
+    AC_0,              // contact area
+    AC_1,
+    EPS_BAR_0,         // volume-averaged infinitesimal strain tensor
+    EPS_BAR_1,
+    PENALTY,           // contact penalty
+    DELTA_MAX,         // largest apparent overlap recorded for the contact
+    DELTAP_0,          // per-side value used by the rigid flat placement scheme
+    DELTAP_1
+  };
+
+}    // namespace Granular_MDR_NS
+
+// Fix style GRANULAR/MDR, the companion fix of the MDR normal contact model.
+// NOTE(review): the index_* members look like slots into atom->dvector for
+// custom per-atom properties (index_Acon0 and index_ddelta_bar are used that
+// way in update_fix_gran_wall()) - confirm for the remaining ones.
+class FixGranularMDR : public Fix {
+ public:
+  FixGranularMDR(class LAMMPS *, int, char **);
+  ~FixGranularMDR() override;
+  int setmask() override;
+  void post_constructor() override;
+  void setup_pre_force(int) override;
+  void pre_force(int) override;
+  int pack_forward_comm(int, int *, double *, int, int *) override;
+  void unpack_forward_comm(int, int, double *) override;
+  void set_arrays(int) override;
+
+ private:
+  int comm_stage;
+  char *id_fix;
+  double psi_b_coeff;
+  class PairGranular *pair;
+  class FixNeighHistory *fix_history;
+  std::vector<Fix *> fix_wall_list;    // wall fixes processed by update_fix_gran_wall()
+
+  void mean_surf_disp();
+  void calculate_contact_penalty();
+  void update_fix_gran_wall();
+
+  int index_Ro;                 // initial radius
+  int index_Vgeo;               // geometric particle volume of apparent particle after removing spherical cap volume
+  int index_Velas;              // particle volume from linear elasticity
+  int index_Vcaps;              // spherical cap volume from intersection of apparent radius particle and contact planes
+  int index_eps_bar;            // volume-averaged infinitesimal strain tensor
+  int index_dRnumerator;        // summation of numerator terms in calculation of dR
+  int index_dRdenominator;      // summation of denominator terms in calculation of dR
+  int index_Acon0;              // total area involved in contacts: Acon^{n}
+  int index_Acon1;              // total area involved in contacts: Acon^{n+1}
+  int index_Atot;               // total particle area
+  int index_Atot_sum;           // running sum of contact area minus cap area
+  int index_ddelta_bar;         // change in mean surface displacement
+  int index_psi;                // ratio of free surface area to total surface area
+  int index_sigmaxx;            // xx-component of the stress tensor, not necessary for force calculation
+  int index_sigmayy;            // yy-component of the stress tensor, not necessary for force calculation
+  int index_sigmazz;            // zz-component of the stress tensor, not necessary for force calculation
+  int index_history_setup_flag; // flag to check if history variables have been initialized
+  int index_contacts;           // total contacts on particle
+  int index_adhesive_length;    // total length of adhesive contact on a particle
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
--- a/src/GRANULAR/fix_wall_gran.cpp
+++ b/src/GRANULAR/fix_wall_gran.cpp
@ -200,7 +200,7 @@ FixWallGran::FixWallGran(LAMMPS *lmp, int narg, char **arg) :
      iarg += 3;
    } else if (strcmp(arg[iarg],"contacts") == 0) {
      peratom_flag = 1;
-      size_peratom_cols = 8;
+      size_peratom_cols = 8 + model->nsvector;
      peratom_freq = 1;
      iarg += 1;
    } else if (strcmp(arg[iarg],"temperature") == 0) {
@ -365,7 +365,7 @@ void FixWallGran::setup(int vflag)

 void FixWallGran::post_force(int /*vflag*/)
 {
-  int i,j;
+  int i,j,n;
  double dx,dy,dz,del1,del2,delxy,delr,rwall,meff;
  double *forces, *torquesi;
  double vwall[3];
@ -437,7 +437,9 @@ void FixWallGran::post_force(int /*vflag*/)

  rwall = 0.0;

+  model->calculate_svector = 0;
  if (peratom_flag) {
+    model->calculate_svector = 1;
    clear_stored_contacts();
  }

@ -546,6 +548,9 @@ void FixWallGran::post_force(int /*vflag*/)
      array_atom[i][5] = x[i][1] - dy;
      array_atom[i][6] = x[i][2] - dz;
      array_atom[i][7] = radius[i];
+
+      for (n = 0; n < model->nsvector; n++)
+        array_atom[i][8 + n] = model->svector[n];
    }
  }
 }
--- a/src/GRANULAR/fix_wall_gran.h
+++ b/src/GRANULAR/fix_wall_gran.h
@ -50,14 +50,14 @@ class FixWallGran : public Fix {
  int maxsize_restart() override;
  void reset_dt() override;

+  // for granular model choices
+  class Granular_NS::GranularModel *model;
+
 protected:
  int wallstyle, wiggle, wshear, axis;
  int nlevels_respa;
  bigint time_origin;

-  // for granular model choices
-  class Granular_NS::GranularModel *model;
-
  double lo, hi, cylradius;
  double amplitude, period, omega, vshear;
  double dt;
@ -84,7 +84,7 @@ class FixWallGran : public Fix {

  // store particle interactions

-  int store;
+  int nsvector;

  void clear_stored_contacts();
 };
--- a/src/GRANULAR/fix_wall_gran_region.cpp
+++ b/src/GRANULAR/fix_wall_gran_region.cpp
@ -118,7 +118,7 @@ void FixWallGranRegion::init()

 void FixWallGranRegion::post_force(int /*vflag*/)
 {
-  int i, m, nc, iwall;
+  int i, n, m, nc, iwall;
  double *forces, *torquesi;
  double meff, vwall[3], w0[3] = {0.0, 0.0, 0.0};
  bool touchflag = false;
@ -174,7 +174,11 @@ void FixWallGranRegion::post_force(int /*vflag*/)
    region->set_velocity();
  }

-  if (peratom_flag) clear_stored_contacts();
+  model->calculate_svector = 0;
+  if (peratom_flag) {
+    model->calculate_svector = 1;
+    clear_stored_contacts();
+  }

  // Define constant wall properties (atom j)
  model->radj = 0.0;
@ -228,6 +232,9 @@ void FixWallGranRegion::post_force(int /*vflag*/)
      model->radi = radius[i];
      model->radj = region->contact[ic].radius;
      model->r = region->contact[ic].r;
+      model->i = i;
+      model->j = ic;
+
      if (model->beyond_contact) model->touch = history_many[i][c2r[ic]][0];

      touchflag = model->check_contact();
@ -280,6 +287,9 @@ void FixWallGranRegion::post_force(int /*vflag*/)
        array_atom[i][5] = x[i][1] - model->dx[1];
        array_atom[i][6] = x[i][2] - model->dx[2];
        array_atom[i][7] = radius[i];
+
+        for (n = 0; n < model->nsvector; n++)
+          array_atom[i][8 + n] = model->svector[n];
      }
    }
  }
--- a/src/GRANULAR/fix_wall_gran_region.h
+++ b/src/GRANULAR/fix_wall_gran_region.h
@ -44,9 +44,8 @@ class FixWallGranRegion : public FixWallGran {
  int size_restart(int) override;
  int maxsize_restart() override;

- private:
  class Region *region;
-  int nregion;
+  void update_contacts(int, int);

  // shear history for multiple contacts per particle

@ -57,10 +56,11 @@ class FixWallGranRegion : public FixWallGran {
  int *c2r;                  // contact to region mapping
                             // c2r[i] = index of Ith contact in
                             //   region-contact[] list of contacts
+ private:
+
+  int nregion;
  int motion_resetflag;      // used by restart to indicate that region
                             //    vel info is to be reset
-
-  void update_contacts(int, int);
 };

 }    // namespace LAMMPS_NS
--- a/src/GRANULAR/gran_sub_mod.cpp
+++ b/src/GRANULAR/gran_sub_mod.cpp
@ -42,6 +42,7 @@ GranSubMod::GranSubMod(class GranularModel *gm, LAMMPS *lmp) : Pointers(lmp)
  beyond_contact = 0;
  num_coeffs = 0;
  contact_radius_flag = 0;
+  nsvector = 0;

  nondefault_history_transfer = 0;
  transfer_history_factor = nullptr;
--- a/src/GRANULAR/gran_sub_mod.h
+++ b/src/GRANULAR/gran_sub_mod.h
@ -46,6 +46,8 @@ namespace Granular_NS {

    GranularModel *gm;

+    int nsvector, index_svector;
+
   protected:
    int allocated;

--- a/src/GRANULAR/gran_sub_mod_heat.cpp
+++ b/src/GRANULAR/gran_sub_mod_heat.cpp
@ -50,6 +50,7 @@ GranSubModHeatRadius::GranSubModHeatRadius(GranularModel *gm, LAMMPS *lmp) : Gra
  num_coeffs = 1;
  contact_radius_flag = 1;
  conductivity = 0.0;
+  nsvector = 1;
 }

 /* ---------------------------------------------------------------------- */
@ -65,7 +66,9 @@ void GranSubModHeatRadius::coeffs_to_local()

 double GranSubModHeatRadius::calculate_heat()
 {
-  return 2 * conductivity * gm->contact_radius * (gm->Tj - gm->Ti);
+  double heat = 2 * conductivity * gm->contact_radius * (gm->Tj - gm->Ti);
+  if (gm->calculate_svector) gm->svector[index_svector] = heat;
+  return heat;
 }


@ -78,6 +81,7 @@ GranSubModHeatArea::GranSubModHeatArea(GranularModel *gm, LAMMPS *lmp) : GranSub
  num_coeffs = 1;
  contact_radius_flag = 1;
  heat_transfer_coeff = 0.0;
+  nsvector = 1;
 }

 /* ---------------------------------------------------------------------- */
@ -93,5 +97,7 @@ void GranSubModHeatArea::coeffs_to_local()

 double GranSubModHeatArea::calculate_heat()
 {
-  return heat_transfer_coeff * MY_PI * gm->contact_radius * gm->contact_radius * (gm->Tj - gm->Ti);
+  double heat = heat_transfer_coeff * MY_PI * gm->contact_radius * gm->contact_radius * (gm->Tj - gm->Ti);
+  if (gm->calculate_svector) gm->svector[index_svector] = heat;
+  return heat;
 }
--- a/src/GRANULAR/gran_sub_mod_normal.cpp
+++ b/src/GRANULAR/gran_sub_mod_normal.cpp
@ -13,25 +13,70 @@

 #include "gran_sub_mod_normal.h"

+#include "atom.h"
 #include "error.h"
+#include "citeme.h"
+#include "fix_granular_mdr.h"
 #include "granular_model.h"
 #include "math_const.h"
+#include "modify.h"
+#include "update.h"

 #include <cmath>
+#include <iomanip>
+#include <sstream>

 using namespace LAMMPS_NS;
 using namespace Granular_NS;
+using namespace MathConst;

-using MathConst::MY_2PI;
-using MathConst::MY_PI;
-
-static constexpr double PI27SQ = 266.47931882941264802866;      // 27*PI**2
+static constexpr double PISQ = 9.8696044010893579923;            // PI^2
+static constexpr double PIINV = 0.318309886183790691216;         // 1/PI
+static constexpr double PI27SQ = 266.479318829412648029;         // 27*PI^2
+static constexpr double PITOFIVETHIRDS = 6.73880859569814116838; // PI^(5/3)
+static constexpr double CBRT2 = 1.25992104989487319067;          // cbrt(2)
+static constexpr double SQRTHALFPI = 1.25331413731550012081;     // sqrt(PI/2)
+static constexpr double CBRTHALFPI = 1.16244735150962652526;     // cbrt(PI/2)
+static constexpr double FOURTHIRDS = 1.33333333333333333333;     // 4/3
 static constexpr double THREEROOT3 = 5.19615242270663202362;     // 3*sqrt(3)
 static constexpr double SIXROOT6 = 14.69693845669906728801;      // 6*sqrt(6)
 static constexpr double INVROOT6 = 0.40824829046386307274;       // 1/sqrt(6)
-static constexpr double FOURTHIRDS = (4.0 / 3.0);               // 4/3
 static constexpr double JKRPREFIX = 1.2277228507842888;          // cbrt(3*PI**2/16)

+static constexpr int MDR_MAX_IT = 100;                           // Newton-Raphson for MDR
+static constexpr double MDR_EPSILON1 = 1e-10;                    // Newton-Raphson for MDR
+static constexpr double MDR_EPSILON2 = 1e-16;                    // Newton-Raphson for MDR
+static constexpr double MDR_EPSILON3 = 1e-20;                    // For precision checks
+static constexpr double MDR_OVERLAP_LIMIT = 0.75;                // Maximum contact overlap for MDR
+
+// Citation text handed to CiteMe by the GranSubModNormalMDR constructor:
+// a one-line summary with the DOIs, followed by one BibTeX entry per paper.
+static const char cite_mdr[] =
+    "MDR contact model command: (i) https://doi.org/10.1016/j.jmps.2023.105492 || (ii) https://doi.org/10.1016/j.jmps.2023.105493 || (iii) https://doi.org/10.31224/4289\n\n"
+    "@Article{zunker2024mechanicallyI,\n"
+    " author =  {Zunker, William and Kamrin, Ken},\n"
+    " title =   {A mechanically-derived contact model for adhesive elastic-perfectly plastic particles,\n"
+    "            Part I: Utilizing the method of dimensionality reduction},\n"
+    " journal = {Journal of the Mechanics and Physics of Solids},\n"
+    " year =    {2024},\n"
+    " volume =  {183},\n"
+    " pages =   {105492},\n"
+    "}\n\n"
+    "@Article{zunker2024mechanicallyII,\n"
+    " author =  {Zunker, William and Kamrin, Ken},\n"
+    " title =   {A mechanically-derived contact model for adhesive elastic-perfectly plastic particles,\n"
+    "            Part II: Contact under high compaction—modeling a bulk elastic response},\n"
+    " journal = {Journal of the Mechanics and Physics of Solids},\n"
+    " year =    {2024},\n"
+    " volume =  {183},\n"
+    " pages =   {105493},\n"
+    "}\n\n"
+    "@Article{zunker2025experimentally,\n"
+    " author =  {Zunker, William and Dunatunga, Sachith and Thakur, Subhash and Tang, Pingjun and Kamrin, Ken},\n"
+    " title =   {Experimentally validated DEM for large deformation powder compaction:\n"
+    "            mechanically-derived contact model and screening of non-physical contacts},\n"
+    " year =    {2025},\n"
+    " journal = {engrXiv},\n"
+    "}\n\n";
+
 /* ----------------------------------------------------------------------
   Default normal model
 ------------------------------------------------------------------------- */
@ -381,3 +426,574 @@ void GranSubModNormalJKR::set_fncrit()
 {
  Fncrit = fabs(Fne + 2.0 * F_pulloff);
 }
+
+/* ----------------------------------------------------------------------
+   MDR contact model
+
+   Contributing authors:
+   William Zunker (MIT), Sachith Dunatunga (MIT),
+   Dan Bolintineanu (SNL), Joel Clemmer (SNL)
+------------------------------------------------------------------------- */
+
+GranSubModNormalMDR::GranSubModNormalMDR(GranularModel *gm, LAMMPS *lmp) :
+    GranSubModNormal(gm, lmp)
+{
+  // register the MDR references whenever a CiteMe instance exists
+  if (lmp->citeme != nullptr) { lmp->citeme->add(cite_mdr); }
+
+  // the companion fix has not been looked up or created yet (see init())
+  id_fix = nullptr;
+  fix_mdr_flag = 0;
+
+  // model dimensions: six input coefficients, 26 history values per contact
+  // (one per Granular_MDR_NS::HistoryIndex entry) and one svector entry
+  nsvector = 1;
+  size_history = 26;
+  contact_radius_flag = 1;
+  num_coeffs = 6;
+
+  // request non-default history transfer with a factor of +1 for every slot
+  nondefault_history_transfer = 1;
+  double *factors = new double[size_history];
+  int slot = 0;
+  while (slot < size_history) {
+    factors[slot] = +1;
+    ++slot;
+  }
+  transfer_history_factor = factors;
+}
+
+/* ---------------------------------------------------------------------- */
+
+GranSubModNormalMDR::~GranSubModNormalMDR()
+{
+  // id_fix is only set when init() created the companion fix itself; remove
+  // that fix again as long as Modify still reports a non-zero fix count
+  const bool created_fix = (id_fix != nullptr);
+  if (created_fix && modify->nfix) { modify->delete_fix(id_fix); }
+
+  // release the stored fix ID (delete[] on a null pointer is a no-op)
+  delete[] id_fix;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void GranSubModNormalMDR::coeffs_to_local()
+{
+  // unpack the six user-supplied coefficients in their input order
+  E = coeffs[0];        // Young's modulus
+  nu = coeffs[1];       // Poisson's ratio
+  Y = coeffs[2];        // yield stress
+  gamma = coeffs[3];    // effective surface energy
+  psi_b = coeffs[4];    // threshold on A_{free}/A_{total} below which the bulk response is triggered
+  CoR = coeffs[5];      // coefficient of restitution
+
+  // reject out-of-range input, checking the coefficients in input order
+  if (E <= 0.0)
+    error->all(FLERR, "Illegal MDR normal model, Young's modulus must be greater than 0");
+  if (nu < 0.0 || nu > 0.5)
+    error->all(FLERR, "Illegal MDR normal model, Poisson's ratio must be between 0 and 0.5");
+  if (Y < 0.0)
+    error->all(FLERR, "Illegal MDR normal model, yield stress must be greater than or equal to 0");
+  if (gamma < 0.0)
+    error->all(FLERR, "Illegal MDR normal model, effective surface energy must be greater than or equal to 0");
+  if (psi_b < 0.0 || psi_b > 1.0)
+    error->all(FLERR, "Illegal MDR normal model, psi_b must be between 0 and 1.0");
+  if (CoR < 0.0 || CoR > 1.0)
+    error->all(FLERR, "Illegal MDR normal model, coefficent of restitution must be between 0 and 1.0");
+
+  // re-arm the one-time warning issued by calculate_forces()
+  warn_flag = 1;
+
+  // powers of the surface energy
+  gammasq = gamma * gamma;
+  gamma3 = gammasq * gamma;
+  gamma4 = gammasq * gammasq;
+
+  // derived elastic constants
+  // NOTE(review): nu == 0.5 passes the check above and makes the bulk modulus
+  // denominator zero (kappa becomes inf) - confirm this is intended
+  G = E / (2.0 * (1.0 + nu));              // shear modulus
+  kappa = E / (3.0 * (1.0 - 2.0 * nu));    // bulk modulus
+  Eeff = E / (1.0 - pow(nu, 2.0));         // composite plane strain modulus
+
+  // cached powers and reciprocals of Eeff
+  Eeffsq = Eeff * Eeff;
+  Eeffinv = 1.0 / Eeff;
+  Eeffsqinv = Eeffinv * Eeffinv;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void GranSubModNormalMDR::init()
+{
+  // on the first call only: make sure a GRANULAR/MDR fix exists, creating one
+  // with ID "MDR" for group all when none has been defined
+  if (!fix_mdr_flag) {
+    const bool have_mdr_fix = !modify->get_fix_by_style("GRANULAR/MDR").empty();
+    if (!have_mdr_fix) {
+      id_fix = utils::strdup("MDR");
+      modify->add_fix(fmt::format("{} all GRANULAR/MDR", id_fix));
+    }
+    fix_mdr_flag = 1;
+  }
+
+  // look up a custom per-atom property by name; the flag and column outputs
+  // of find_custom() are not needed here
+  const auto custom_index = [this](const char *name) {
+    int flag, cols;
+    return atom->find_custom(name, flag, cols);
+  };
+
+  // initialize particle history variables
+
+  // geometry and volumes
+  index_Ro = custom_index("Ro");          // initial radius
+  index_Vcaps = custom_index("Vcaps");    // spherical cap volume from intersection of apparent radius particle and contact planes
+  index_Vgeo = custom_index("Vgeo");      // geometric particle volume of apparent particle after removing spherical cap volume
+  index_Velas = custom_index("Velas");    // particle volume from linear elasticity
+
+  // strain and radius update sums
+  index_eps_bar = custom_index("eps_bar");                // volume-averaged infinitesimal strain tensor
+  index_dRnumerator = custom_index("dRnumerator");        // summation of numerator terms in calculation of dR
+  index_dRdenominator = custom_index("dRdenominator");    // summation of denominator terms in calculation of dR
+
+  // contact and surface areas
+  index_Acon0 = custom_index("Acon0");          // total area involved in contacts: Acon^{n}
+  index_Acon1 = custom_index("Acon1");          // total area involved in contacts: Acon^{n+1}
+  index_Atot = custom_index("Atot");            // total particle area
+  index_Atot_sum = custom_index("Atot_sum");    // running sum of contact area minus cap area
+
+  // bulk response
+  index_ddelta_bar = custom_index("ddelta_bar");    // change in mean surface displacement
+  index_psi = custom_index("psi");                  // ratio of free surface area to total surface area
+
+  // stress components, not necessary for force calculation
+  index_sigmaxx = custom_index("sigmaxx");    // xx-component of the stress tensor
+  index_sigmayy = custom_index("sigmayy");    // yy-component of the stress tensor
+  index_sigmazz = custom_index("sigmazz");    // zz-component of the stress tensor
+}
+
+/* ---------------------------------------------------------------------- */
+
+double GranSubModNormalMDR::calculate_forces()
+{
+  using namespace Granular_MDR_NS;
+  // To understand the structure of the overall code it is important to consider
+  // the following:
+  //
+  // The MDR contact model was developed by imagining individual particles being
+  // squished between a number of rigid flats (references below). To allow
+  // for many interacting particles, we extend the idea of isolated particles surrounded
+  // by rigid flats. In particular, we imagine placing rigid flats at the overlaps
+  // between particles. The force is calculated separately on both sides
+  // of the contact assuming interaction with a rigid flat. The two forces are then
+  // averaged on either side of the contact to determine the final force. If the
+  // contact is between a particle and wall then only one force evaluation is required.
+  //
+  // Zunker and Kamrin, 2024, Part I: https://doi.org/10.1016/j.jmps.2023.105492
+  // Zunker and Kamrin, 2024, Part II: https://doi.org/10.1016/j.jmps.2023.105493
+  // Zunker, Dunatunga, Thakur, Tang, and Kamrin, 2025: https://doi.org/10.31224/4289
+
+  double *Rinitial = atom->dvector[index_Ro];
+  double *Vgeo = atom->dvector[index_Vgeo];
+  double *Velas = atom->dvector[index_Velas];
+  double *Vcaps = atom->dvector[index_Vcaps];
+  double *eps_bar = atom->dvector[index_eps_bar];
+  double *dRnumerator = atom->dvector[index_dRnumerator];
+  double *dRdenominator = atom->dvector[index_dRdenominator];
+  double *Acon0 = atom->dvector[index_Acon0];
+  double *Acon1 = atom->dvector[index_Acon1];
+  double *Atot = atom->dvector[index_Atot];
+  double *Atot_sum = atom->dvector[index_Atot_sum];
+  double *ddelta_bar = atom->dvector[index_ddelta_bar];
+  double *psi = atom->dvector[index_psi];
+  double *sigmaxx = atom->dvector[index_sigmaxx];
+  double *sigmayy = atom->dvector[index_sigmayy];
+  double *sigmazz = atom->dvector[index_sigmazz];
+
+  const int itag_true = atom->tag[gm->i]; // true i particle tag
+  const int jtag_true = atom->tag[gm->j]; // true j particle tag
+  const int i_true = gm->i;               // true i particle index
+  const int j_true = gm->j;               // true j particle index
+  const double radi_true = gm->radi;      // true i particle initial radius
+  const double radj_true = gm->radj;      // true j particle initial radius
+
+  double F = 0.0;                         // average force
+  double F0 = 0.0;                        // force on contact side 0
+  double F1 = 0.0;                        // force on contact side 1
+  double delta = gm->delta;               // apparent overlap
+  double Ac_avg = 0.0;                    // average contact area across both sides
+
+  double *history = & gm->history[history_index]; // load in all history variables
+  int history_update = gm->history_update;
+
+  // Rigid flat placement scheme
+  double *deltamax_offset = & history[DELTA_MAX];
+  double *deltap_offset0 = & history[DELTAP_0];
+  double *deltap_offset1 = & history[DELTAP_1];
+  double deltap0 = *deltap_offset0;
+  double deltap1 = *deltap_offset1;
+
+  // always update deltamax since gm->delta won't change until initial integrate
+  //   also need to define deltamax if an atom is created with an overlap
+  double deltamaxi = *deltamax_offset;
+  if (gm->delta >= *deltamax_offset) *deltamax_offset = gm->delta;
+  double deltamax = *deltamax_offset;
+
+
+  for (int contactSide = 0; contactSide < 2; contactSide++) {
+
+    double *delta_offset, *deltao_offset, *delta_MDR_offset, *delta_BULK_offset;
+    double *deltamax_MDR_offset, *Yflag_offset, *deltaY_offset, *cA_offset, *aAdh_offset;
+    double *Ac_offset, *eps_bar_offset, *penalty_offset, *deltap_offset;
+
+    if (gm->contact_type == PAIR) {
+      // displacement partitioning only necessary for particle-particle contact
+
+      // itag and jtag persist after neighbor list builds, use tags to compare to match
+      //   contact history variables consistently across steps for a particle pair.
+      if ((contactSide == 0 && itag_true > jtag_true) || (contactSide != 0 && itag_true < jtag_true)) {
+          gm->i = i_true;
+          gm->j = j_true;
+          gm->radi = radi_true;
+          gm->radj = radj_true;
+      } else {
+          gm->i = j_true;
+          gm->j = i_true;
+          gm->radi = radj_true;
+          gm->radj = radi_true;
+      }
+
+      // determine the two maximum experienced geometric overlaps on either side of rigid flat
+      double delta_geo, delta_geo_alt;
+      double denom = 1.0 / (2.0 * (deltamax - gm->radi - gm->radj));
+      double delta_geoOpt1 = deltamax * (deltamax - 2.0 * gm->radj) * denom;
+      double delta_geoOpt2 = deltamax * (deltamax - 2.0 * gm->radi) * denom;
+      if (gm->radi < gm->radj) {
+        delta_geo = MAX(delta_geoOpt1, delta_geoOpt2);
+        delta_geo_alt = MIN(delta_geoOpt1,delta_geoOpt2);
+      } else {
+        delta_geo = MIN(delta_geoOpt1, delta_geoOpt2);
+        delta_geo_alt = MAX(delta_geoOpt1, delta_geoOpt2);
+      }
+
+      // cap displacement if it exceeds the overlap limit, partition the remainder to the other side
+      if (delta_geo / gm->radi > MDR_OVERLAP_LIMIT) {
+        delta_geo = gm->radi * MDR_OVERLAP_LIMIT;
+      } else if (delta_geo_alt / gm->radj > MDR_OVERLAP_LIMIT) {
+        delta_geo = deltamax - gm->radj * MDR_OVERLAP_LIMIT;
+      }
+
+      // determine final delta used for subsequent calculations
+      double deltap = deltap0 + deltap1;
+      if (contactSide == 0) {
+        delta = delta_geo + (deltap0 - delta_geo) * (gm->delta - deltamax) / (deltap - deltamax);
+      } else {
+        delta = delta_geo + (deltap1 - delta_geo) * (gm->delta - deltamax) / (deltap - deltamax);
+      }
+    } else if (gm->contact_type != PAIR && contactSide != 0) {
+      // contact with particle-wall requires only one evaluation
+      break;
+    }
+
+    delta_offset = &history[DELTA_0 + contactSide];
+    deltao_offset = &history[DELTAO_0 + contactSide];
+    delta_MDR_offset = &history[DELTA_MDR_0 + contactSide];
+    delta_BULK_offset = &history[DELTA_BULK_0 + contactSide];
+    deltamax_MDR_offset = &history[DELTAMAX_MDR_0 + contactSide];
+    Yflag_offset = &history[YFLAG_0 + contactSide];
+    deltaY_offset = &history[DELTAY_0 + contactSide];
+    cA_offset = &history[CA_0 + contactSide];
+    aAdh_offset = &history[AADH_0 + contactSide];
+    Ac_offset = &history[AC_0 + contactSide];
+    eps_bar_offset = &history[EPS_BAR_0 + contactSide];
+    deltap_offset = &history[DELTAP_0 + contactSide];
+
+    // temporary i and j indices
+    const int i = gm->i;
+    const int j = gm->j;
+
+    // geometric property definitions
+    const double Ro = Rinitial[i];              // initial radius
+    const double R = gm->radi;                  // apparent radius
+
+    // kinematics
+    const double ddelta = delta - *delta_offset;
+    if (history_update) *delta_offset = delta;
+
+    const double deltao = delta - (R - Ro);
+    const double ddeltao = deltao - *deltao_offset;
+    if (history_update) *deltao_offset = deltao;
+
+    double ddelta_MDR, ddelta_BULK;
+    if (psi[i] < psi_b) {
+      // bulk response triggered, split displacement increment between MDR and BULK components
+      ddelta_MDR = MIN(ddelta - ddelta_bar[i], delta - *delta_MDR_offset);
+      ddelta_BULK = ddelta_bar[i];
+    } else {
+      // no bulk response, full displacement increment goes to the MDR component
+      ddelta_BULK = 0.0;
+      ddelta_MDR = ddelta;
+    }
+
+    // calculate and update MDR/BULK displacements
+    const double delta_MDR = *delta_MDR_offset + ddelta_MDR;
+    if (history_update) *delta_MDR_offset = delta_MDR;
+    const double delta_BULK = MAX(0.0, *delta_BULK_offset + ddelta_BULK);
+    if (history_update) *delta_BULK_offset = delta_BULK;
+
+    if (delta_MDR > *deltamax_MDR_offset) *deltamax_MDR_offset = delta_MDR;
+    const double deltamax_MDR = *deltamax_MDR_offset;
+
+    // average pressure along yield surface
+    const double pY = Y * (1.75 * exp(-4.4 * deltamax_MDR / R) + 1.0);
+
+    if (*Yflag_offset == 0.0 && delta_MDR >= deltamax_MDR) {
+    const double phertz = 4 * Eeff * sqrt(delta_MDR) / (3 * MY_PI * sqrt(R));
+      if (!history_update && warn_flag && deltamaxi == 0 && phertz > pY) {
+        error->warning(FLERR, "The newly inserted particles have pre-existing overlaps that "
+                          "have caused immediate plastic deformation. This could lead to "
+                          "non-physical results in the MDR model, as it handles some aspects "
+                          "related to plastic deformation incrementally.");
+        warn_flag = 0;
+      }
+      if (history_update && phertz > pY) {
+        *Yflag_offset = 1.0;
+        *deltaY_offset = delta_MDR;
+        *cA_offset = MY_PI * (pow(*deltaY_offset, 2) - *deltaY_offset * R);
+      }
+    }
+
+    // MDR force calculation
+    double F_MDR;
+    double A, Ainv;               // height of elliptical indenter
+    double B;                     // width of elliptical indenter
+    double deltae1D;              // transformed elastic displacement
+    double deltaR;                // displacement correction
+    double amax, amaxsq;          // maximum experienced contact radius
+    const double cA = *cA_offset; // contact area intercept
+
+    if (*Yflag_offset == 0.0) {
+      // elastic contact
+      A = 4.0 * R;
+      Ainv = 1.0 / A;
+      B = 2.0 * R;
+      deltae1D = delta_MDR;
+      amax = sqrt(deltamax_MDR * R);
+    } else {
+      // plastic contact
+      amax = sqrt(2.0 * deltamax_MDR * R - pow(deltamax_MDR, 2) + cA * PIINV);
+      amaxsq = amax * amax;
+      A = 4.0 * pY * Eeffinv * amax;
+      Ainv = 1.0 / A;
+      B = 2.0 * amax;
+
+      // maximum transformed elastic displacement
+      const double deltae1Dmax = A * 0.5;
+
+      // force caused by full submersion of elliptical indenter to depth of A/2
+      double Fmax = Eeff * (A * B * 0.25) * acos(1 - 2 * deltae1Dmax * Ainv);
+      Fmax -= (2 - 4 * deltae1Dmax * Ainv) * sqrt(deltae1Dmax * Ainv - pow(deltae1Dmax * Ainv, 2));
+
+      // depth of particle center
+      const double zR = R - (deltamax_MDR - deltae1Dmax);
+
+      deltaR = 2 * amaxsq * (-1 + nu) - (-1 + 2 * nu) * zR * (-zR + sqrt(amaxsq + pow(zR, 2)));
+      deltaR *= Fmax / (MY_2PI * amaxsq * G * sqrt(amaxsq + pow(zR, 2)));
+
+      // transformed elastic displacement
+      deltae1D = (delta_MDR - deltamax_MDR + deltae1Dmax + deltaR) / (1 + deltaR / deltae1Dmax);
+
+      // added for rigid flat placement
+      if (history_update) *deltap_offset = deltamax_MDR - (deltae1Dmax + deltaR);
+    }
+
+    double a_na;
+    double a_fac = 0.99;
+    (deltae1D >= 0.0) ? a_na = B * sqrt(A - deltae1D) * sqrt(deltae1D) * Ainv : a_na = 0.0;
+    double aAdh = *aAdh_offset;
+    if (aAdh > a_fac * amax) aAdh = a_fac * amax;
+
+    double Ainvsq = Ainv * Ainv;
+    double Asq = A * A;
+    double A3 = Asq * A;
+    double A4 = Asq * Asq;
+
+    double Binv = 1.0 / B;
+    double Bsq = B * B;
+    double B4 = Bsq * Bsq;
+
+    if (gamma <= 0.0) {
+      // non-adhesive contact
+
+      if (deltae1D <= 0.0) {
+        F_MDR = 0.0;
+      } else {
+        F_MDR = calculate_nonadhesive_mdr_force(deltae1D, Ainv, Eeff, A, B);
+      }
+
+      if (std::isnan(F_MDR)) {
+        error->one(FLERR, "F_MDR is NaN, non-adhesive case");
+      }
+
+      if (history_update) *aAdh_offset = a_na;
+    } else {
+      // adhesive contact
+      double g_aAdh;
+
+      if (delta_MDR == deltamax_MDR || a_na >= aAdh) {
+        // case 1: no tensile springs, purely compressive contact
+
+        if (deltae1D <= 0.0) {
+          F_MDR = 0.0;
+        } else {
+          F_MDR = calculate_nonadhesive_mdr_force(deltae1D, Ainv, Eeff, A, B);
+        }
+
+        if (std::isnan(F_MDR))
+          error->one(FLERR, "F_MDR is NaN, case 1: no tensile springs");
+
+        if (history_update) *aAdh_offset = a_fac * a_na;
+      } else {
+        // case 2+3, tensile springs
+        const double lmax = sqrt(MY_2PI * aAdh * gamma * Eeffinv);
+        g_aAdh = A * 0.5 - A * Binv * sqrt(Bsq * 0.25 - pow(aAdh, 2));
+        g_aAdh = round_up_negative_epsilon(g_aAdh);
+
+        double tmp = 27 * A4 * B4 * gamma * Eeffinv;
+        tmp -= 2 * pow(B, 6) * gamma3 * PISQ * pow(Eeffinv, 3);
+        tmp += sqrt(27) * Asq * B4 * sqrt(27 * A4 * Eeffsq * gammasq - 4 * Bsq * gamma4 * PISQ) * Eeffsqinv;
+        tmp = cbrt(tmp);
+
+        double acrit = -Bsq * gamma * MY_PI * Ainvsq * Eeffinv;
+        acrit += CBRT2 * B4 * gammasq * PITOFIVETHIRDS / (Asq * Eeffsq * tmp);
+        acrit += CBRTHALFPI * tmp * Ainvsq;
+        acrit /= 6;
+
+        if ((deltae1D + lmax - g_aAdh) >= 0.0) {
+          // case 2: tensile springs do not exceed critical length --> deltae + lmax - g(aAdhes) >= 0
+          const double deltaeAdh = g_aAdh;
+          const double F_na = calculate_nonadhesive_mdr_force(deltaeAdh, Ainv, Eeff, A, B);
+          const double F_Adhes = 2.0 * Eeff * (deltae1D - deltaeAdh) * aAdh;
+          F_MDR = F_na + F_Adhes;
+          if (std::isnan(F_MDR))
+            error->one(FLERR, "F_MDR is NaN, case 2: tensile springs, but not exceeding critical length");
+        } else {
+          // case 3: tensile springs exceed critical length --> deltae + lmax - g(aAdhes) = 0
+
+          if (aAdh < acrit) {
+            aAdh = 0.0;
+            F_MDR = 0.0;
+          } else {
+            // newton-raphson to find aAdh
+            double aAdh_tmp = aAdh;
+            double fa, fa2, fa_tmp, dfda;
+            // Newton-Raphson iteration on aAdh_tmp, driving the residual
+            //   fa = deltae1D - A*0.5 + A*sqrt(Bsq*0.25 - a^2)*Binv + sqrt(MY_2PI*a*gamma*Eeffinv)
+            // towards zero. std::abs is used on purpose: an unqualified abs() may
+            // resolve to the C integer abs(int), which truncates the double
+            // argument and would make both convergence tests pass prematurely.
+            for (int lv1 = 0; lv1 < MDR_MAX_IT; ++lv1) {
+              fa_tmp = deltae1D - A * 0.5 + A * sqrt(Bsq * 0.25 - pow(aAdh_tmp, 2)) * Binv;
+              fa = fa_tmp + sqrt(MY_2PI * aAdh_tmp * gamma * Eeffinv);
+              // converged: residual is small enough
+              if (std::abs(fa) < MDR_EPSILON1) {
+                break;
+              }
+              // derivative of the residual with respect to aAdh_tmp
+              dfda = -aAdh_tmp * A / (B * sqrt(-pow(aAdh_tmp, 2) + Bsq * 0.25));
+              dfda += gamma * SQRTHALFPI / sqrt(aAdh_tmp * gamma * Eeff);
+              aAdh_tmp = aAdh_tmp - fa / dfda;
+              // fa2 re-evaluates only the sqrt term at the updated aAdh_tmp;
+              // fa_tmp still holds the value from before the update
+              fa2 = fa_tmp + sqrt(MY_2PI * aAdh_tmp * gamma * Eeffinv);
+              // converged: residual no longer changes between iterates
+              if (std::abs(fa - fa2) < MDR_EPSILON2) {
+                break;
+              }
+              // no convergence within MDR_MAX_IT iterations: fall back to zero
+              if (lv1 == MDR_MAX_IT - 1) {
+                aAdh_tmp = 0.0;
+              }
+            }
+            aAdh = aAdh_tmp;
+
+            g_aAdh = A * 0.5 - A * Binv * sqrt(Bsq * 0.25 - pow(aAdh, 2));
+            g_aAdh = round_up_negative_epsilon(g_aAdh);
+
+            const double deltaeAdh = g_aAdh;
+            const double F_na = calculate_nonadhesive_mdr_force(deltaeAdh, Ainv, Eeff, A, B);
+            const double F_Adhes = 2.0 * Eeff * (deltae1D - deltaeAdh) * aAdh;
+            F_MDR = F_na + F_Adhes;
+            if (std::isnan(F_MDR))
+              error->one(FLERR, "F_MDR is NaN, case 3: tensile springs exceed critical length");
+          }
+          if (history_update) *aAdh_offset = aAdh;
+        }
+      }
+    }
+
+    // contact penalty scheme
+    penalty_offset = &history[PENALTY];
+    double pij = *penalty_offset;
+    const double wij = MAX(1.0 - pij, 0.0);
+
+    // area related calculations
+    double Ac;
+    (*Yflag_offset == 0.0) ? Ac = MY_PI * delta * R : Ac = MY_PI * (2.0 * delta * R - pow(delta, 2)) + cA;
+    if (Ac < 0.0) Ac = 0.0;
+    if (history_update) {
+      Atot_sum[i] += wij * (Ac - MY_2PI * R * (deltamax_MDR + delta_BULK));
+      Acon1[i] += wij * Ac;
+    }
+    Ac_avg += wij * Ac;
+
+    // bulk force calculation
+    double F_BULK;
+    (delta_BULK <= 0.0) ? F_BULK = 0.0 : F_BULK = (1.0 / Vgeo[i]) * Acon0[i] * delta_BULK * kappa * Ac;
+
+    // total force calculation
+    (contactSide == 0) ? F0 = F_MDR + F_BULK : F1 = F_MDR + F_BULK;
+
+    if (history_update) {
+      // mean surface displacement calculation
+      *Ac_offset = wij * Ac;
+
+      // radius update scheme quantity calculation
+      Vcaps[i] += MY_PI * THIRD * pow(delta, 2) * (3.0 * R - delta);
+    }
+
+    const double Fntmp = wij * (F_MDR + F_BULK);
+    const double fx = Fntmp * gm->nx[0];
+    const double fy = Fntmp * gm->nx[1];
+    const double fz = Fntmp * gm->nx[2];
+    const double bx = -(Ro - deltao) * gm->nx[0];
+    const double by = -(Ro - deltao) * gm->nx[1];
+    const double bz = -(Ro - deltao) * gm->nx[2];
+    const double eps_bar_contact = (fx * bx + fy * by + fz * bz) / (3 * kappa * Velas[i]);
+    if (history_update) eps_bar[i] += eps_bar_contact;
+
+    double desp_bar_contact = eps_bar_contact - *eps_bar_offset;
+    if (history_update && delta_MDR == deltamax_MDR && *Yflag_offset > 0.0 && F_MDR > 0.0) {
+      const double Vo = FOURTHIRDS * MY_PI * pow(Ro, 3);
+      dRnumerator[i] -= Vo * (eps_bar_contact - *eps_bar_offset);
+      dRnumerator[i] -= wij * MY_PI * ddeltao * (2 * deltao * Ro - pow(deltao, 2) + pow(R, 2) - pow(Ro, 2));
+      dRdenominator[i] += wij * 2.0 * MY_PI * R * (deltao + R - Ro);
+    }
+
+    if (history_update) {
+      *eps_bar_offset = eps_bar_contact;
+      sigmaxx[i] += fx * bx / Velas[i];
+      sigmayy[i] += fy * by / Velas[i];
+      sigmazz[i] += fz * bz / Velas[i];
+    }
+  }
+
+  // save contact area
+  if (gm->calculate_svector) gm->svector[index_svector] = Ac_avg * 0.5;
+
+  gm->i = i_true;
+  gm->j = j_true;
+  gm->radi = radi_true;
+  gm->radj = radj_true;
+
+  double *penalty_offset = &history[PENALTY];
+  const double pij = *penalty_offset;
+  const double wij = MAX(1.0 - pij, 0.0);
+
+  // assign final force
+  if (gm->contact_type != PAIR) {
+    F = wij * F0;
+  } else {
+    F = wij * (F0 + F1) * 0.5;
+  }
+
+  // calculate damping force
+  if (F > 0.0) {
+    double Eeff2;
+    double Reff2;
+    if (gm->contact_type == PAIR) {
+      Eeff2 = E / (2.0 * (1.0 - pow(nu, 2)));
+      Reff2 = 1.0 / ((1.0 / gm->radi + 1.0 / gm->radj));
+    } else {
+      Eeff2 = E / (1.0 - pow(nu, 2));
+      Reff2 = gm->radi;
+    }
+    const double kn = Eeff2 * Reff2;
+    const double beta = -log(CoR) / sqrt(pow(log(CoR), 2) + PISQ);
+    const double damp_prefactor = beta * sqrt(gm->meff * kn);
+    const double F_DAMP = -damp_prefactor * gm->vnnr;
+
+    F += wij * F_DAMP;
+  }
+
+  return F;
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+double GranSubModNormalMDR::calculate_nonadhesive_mdr_force(double delta, double Ainv, double Eeff, double A, double B)
+{
+  // evaluates 0.25 * Eeff * A * B * [acos(1 - 2x) - (2 - 4x) * sqrt(x - x^2)]
+  // with x = delta * Ainv; the result is NaN when x lies outside [0,1]
+  const double x = delta * Ainv;
+  const double arc_term = acos(1.0 - 2.0 * delta * Ainv);
+  const double root_term = (2 - 4 * delta * Ainv) * sqrt(x - pow(x, 2));
+
+  return (arc_term - root_term) * (0.25 * Eeff * A * B);
+}
+
+/* ----------------------------------------------------------------------
+   round values within (-EPSILON,0.0) due to machine precision errors to zero
+------------------------------------------------------------------------- */
+
+double GranSubModNormalMDR::round_up_negative_epsilon(double value)
+{
+  // only values strictly inside the open interval (-MDR_EPSILON3, 0) are
+  // snapped to zero; everything else (including NaN) is returned unchanged
+  const bool tiny_negative = (value > -MDR_EPSILON3) && (value < 0.0);
+  return tiny_negative ? 0.0 : value;
+}
--- a/src/GRANULAR/gran_sub_mod_normal.h
+++ b/src/GRANULAR/gran_sub_mod_normal.h
@ -19,6 +19,7 @@ GranSubModStyle(hertz,GranSubModNormalHertz,NORMAL);
 GranSubModStyle(hertz/material,GranSubModNormalHertzMaterial,NORMAL);
 GranSubModStyle(dmt,GranSubModNormalDMT,NORMAL);
 GranSubModStyle(jkr,GranSubModNormalJKR,NORMAL);
+GranSubModStyle(mdr,GranSubModNormalMDR,NORMAL);
 // clang-format on
 #else

@ -133,6 +134,35 @@ namespace Granular_NS {
    int mixed_coefficients;
  };

+  /* ---------------------------------------------------------------------- */
+
+  // Normal-force sub-model that is registered under the style name "mdr"
+  // via the GranSubModStyle() entry near the top of this header.
+  class GranSubModNormalMDR : public GranSubModNormal {
+   public:
+    GranSubModNormalMDR(class GranularModel *, class LAMMPS *);
+    ~GranSubModNormalMDR() override;
+    void coeffs_to_local() override;
+    void init() override;
+    // returns the force value computed for the current contact
+    double calculate_forces() override;
+    double E, nu, Y, gamma, CoR, psi_b; // specified coeffs
+
+   protected:
+    double G, kappa, Eeff; // derived coeffs
+    // presumably cached powers/inverses of Eeff and gamma (judging by the
+    // names) -- TODO confirm where they are assigned
+    double Eeffsq, Eeffinv, Eeffsqinv;
+    double gammasq, gamma3, gamma4;
+
+    int warn_flag;
+
+    // NOTE(review): these look like lookup indices for per-particle quantities
+    // kept by a companion fix (see fix_mdr_flag / id_fix) -- confirm in init()
+    int index_Ro, index_Vgeo, index_Velas, index_Vcaps, index_eps_bar, index_dRnumerator;
+    int index_dRdenominator, index_Acon0, index_Acon1, index_Atot, index_Atot_sum, index_ddelta_bar;
+    int index_psi, index_sigmaxx, index_sigmayy, index_sigmazz, index_contacts, index_adhesive_length;
+    int fix_mdr_flag;
+
+    char *id_fix;
+
+    // NOTE(review): both helpers are declared inline here but are defined out
+    // of line in the implementation file; that only links as long as every
+    // call stays in that translation unit -- confirm or drop 'inline'
+    inline double calculate_nonadhesive_mdr_force(double, double, double, double, double);
+    inline double round_up_negative_epsilon(double);
+  };
+
 }    // namespace Granular_NS
 }    // namespace LAMMPS_NS

--- a/src/GRANULAR/granular_model.cpp
+++ b/src/GRANULAR/granular_model.cpp
@ -27,6 +27,7 @@
 #include "force.h"
 #include "gran_sub_mod.h"
 #include "math_extra.h"
+#include "memory.h"

 #include "style_gran_sub_mod.h"    // IWYU pragma: keep

@ -64,6 +65,10 @@ GranularModel::GranularModel(LAMMPS *lmp) : Pointers(lmp)
  twisting_model = nullptr;
  heat_model = nullptr;

+  calculate_svector = 0;
+  nsvector = 0;
+  svector = nullptr;
+
  for (int i = 0; i < NSUBMODELS; i++) sub_models[i] = nullptr;
  transfer_history_factor = nullptr;

@ -100,6 +105,7 @@ GranularModel::~GranularModel()
  delete[] gran_sub_mod_class;
  delete[] gran_sub_mod_names;
  delete[] gran_sub_mod_types;
+  delete[] svector;

  for (int i = 0; i < NSUBMODELS; i++) delete sub_models[i];
 }
@ -243,7 +249,12 @@ void GranularModel::init()

  // Must have valid normal, damping, and tangential models
  if (normal_model->name == "none") error->all(FLERR, "Must specify normal granular model");
+  if (normal_model->name == "mdr") {
+     if (damping_model->name != "none")
+       error->all(FLERR, "MDR require 'none' damping model. To damp, specify a coefficient of restitution < 1.");
+  } else {
    if (damping_model->name == "none") error->all(FLERR, "Must specify damping granular model");
+  }
  if (tangential_model->name == "none") error->all(FLERR, "Must specify tangential granular model");

  // Twisting, rolling, and heat are optional
@ -293,6 +304,21 @@ void GranularModel::init()
  }

  for (int i = 0; i < NSUBMODELS; i++) sub_models[i]->init();
+
+  nsvector = 0;
+  int index_svector = 0;
+  for (int i = 0; i < NSUBMODELS; i++) {
+    if (sub_models[i]->nsvector != 0) {
+      sub_models[i]->index_svector = index_svector;
+      nsvector += sub_models[i]->nsvector;
+      index_svector += sub_models[i]->nsvector;
+    }
+  }
+
+  if (nsvector != 0) {
+    delete[] svector;
+    svector = new double[nsvector];
+  }
 }

 /* ---------------------------------------------------------------------- */
@ -493,10 +519,9 @@ void GranularModel::calculate_forces()
    if (contact_type == PAIR) sub3(torquesj, tortwist, torquesj);
  }

-  if (heat_defined) {
+  if (heat_defined)
    dq = heat_model->calculate_heat();
 }
-}

 /* ----------------------------------------------------------------------
   compute pull-off distance (beyond contact) for a given radius and atom type
--- a/src/GRANULAR/granular_model.h
+++ b/src/GRANULAR/granular_model.h
@ -74,6 +74,9 @@ class GranularModel : protected Pointers {
  int beyond_contact, limit_damping, history_update;
  ContactType contact_type;

+  // Particle identifiers
+  int i, j, itype, jtype;
+
  // History variables
  int size_history, nondefault_history_transfer;
  double *transfer_history_factor;
@ -93,6 +96,10 @@ class GranularModel : protected Pointers {
  double magtwist;
  bool touch;

+  // Extra output
+  int calculate_svector, nsvector;
+  double *svector;
+
 protected:
  int rolling_defined, twisting_defined, heat_defined; // Flag optional sub models
  int classic_model;                                   // Flag original pair/gran calculations
--- a/src/GRANULAR/pair_granular.cpp
+++ b/src/GRANULAR/pair_granular.cpp
@ -199,6 +199,11 @@ void PairGranular::compute(int eflag, int vflag)
      model->xj = x[j];
      model->radi = radius[i];
      model->radj = radius[j];
+      model->i = i;
+      model->j = j;
+      model->itype = itype;
+      model->jtype = jtype;
+
      if (use_history) model->touch = touch[jj];

      touchflag = model->check_contact();
@ -412,8 +417,10 @@ void PairGranular::init_style()
      error->all(FLERR,"Heat conduction in pair granular requires atom style with heatflow property");
  }

-  // allocate history and initialize models
+  // allocate history and aggregate model information
  class GranularModel* model;
+  // per-model sum of the integer sub-model nsvector counts; kept as int so
+  // that MAX(extra_svector, nsvector_total) does not go through double
+  int nsvector_total;
+  extra_svector = 0;
  int size_max[NSUBMODELS] = {0};
  for (int n = 0; n < nmodels; n++) {
    model = models_list[n];
@ -424,13 +431,23 @@ void PairGranular::init_style()
    }
    if (model->size_history != 0) use_history = 1;

-    for (int i = 0; i < NSUBMODELS; i++)
+    nsvector_total = 0;
+    for (int i = 0; i < NSUBMODELS; i++) {
+      nsvector_total += model->sub_models[i]->nsvector;
      if (model->sub_models[i]->size_history > size_max[i])
        size_max[i] = model->sub_models[i]->size_history;
+    }
+    extra_svector = MAX(extra_svector, nsvector_total);

    if (model->nondefault_history_transfer) nondefault_history_transfer = 1;
  }

+  if (extra_svector != 0) {
+    single_extra = 12 + extra_svector;
+    delete[] svector;
+    svector = new double[single_extra];
+  }
+
  size_history = 0;
  if (use_history) {
    for (int i = 0; i < NSUBMODELS; i++) size_history += size_max[i];
@ -711,6 +728,10 @@ double PairGranular::single(int i, int j, int itype, int jtype,
  model->xj = x[j];
  model->radi = radius[i];
  model->radj = radius[j];
+  model->i = i;
+  model->j = j;
+  model->itype = itype;
+  model->jtype = jtype;
  model->history_update = 0; // Don't update history

  // If history is needed
@ -765,7 +786,9 @@ double PairGranular::single(int i, int j, int itype, int jtype,
  model->omegaj = omega[j];
  model->history = history;

+  model->calculate_svector = 1;
  model->calculate_forces();
+  model->calculate_svector = 0;

  // apply forces & torques
  // Calculate normal component, normalized by r
@ -785,6 +808,14 @@ double PairGranular::single(int i, int j, int itype, int jtype,
  svector[10] = model->dx[1];
  svector[11] = model->dx[2];

+  // add submodel-specific quantities
+  for (int n = 0; n < model->nsvector; n++)
+    svector[12 + n] = model->svector[n];
+
+  // zero any values unused by this specific model
+  for (int n = 12 + model->nsvector; n < single_extra; n++)
+    svector[n] = 0.0;
+
  return 0.0;
 }

--- a/src/GRANULAR/pair_granular.h
+++ b/src/GRANULAR/pair_granular.h
@ -46,6 +46,12 @@ class PairGranular : public Pair {
  double memory_usage() override;
  double atom2cut(int) override;
  double radii2cut(double, double) override;
+  int get_size_history() const { return size_history; }
+
+  // granular models
+  class Granular_NS::GranularModel** models_list;
+  int **types_indices;
+  int nmodels, maxmodels;

 protected:
  int freeze_group_bit;
@ -73,14 +79,11 @@ class PairGranular : public Pair {
  int size_history;
  int heat_flag;

-  // granular models
-  int nmodels, maxmodels;
-  class Granular_NS::GranularModel **models_list;
-  int **types_indices;
-
  // optional user-specified global cutoff, per-type user-specified cutoffs
  double **cutoff_type;
  double cutoff_global;
+
+  int extra_svector;
 };

 }    // namespace LAMMPS_NS
--- a/src/INTEL/verlet_lrt_intel.cpp
+++ b/src/INTEL/verlet_lrt_intel.cpp
@ -94,7 +94,7 @@ void VerletLRTIntel::setup(int flag)
  if (comm->me == 0 && screen) {
    fputs("Setting up VerletLRTIntel run ...\n",screen);
    if (flag) {
-      fmt::print(screen,"  Unit style    : {}\n"
+      utils::print(screen,"  Unit style    : {}\n"
                        "  Current step  : {}\n"
                        "  Time step     : {}\n",
                 update->unit_style,update->ntimestep,update->dt);
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@ -119,6 +119,14 @@ action compute_composition_atom_kokkos.cpp compute_composition_atom.cpp
 action compute_composition_atom_kokkos.h compute_composition_atom.h
 action compute_orientorder_atom_kokkos.cpp
 action compute_orientorder_atom_kokkos.h
+action compute_sna_grid_kokkos.cpp compute_sna_grid.cpp
+action compute_sna_grid_kokkos.h compute_sna_grid.h
+action compute_sna_grid_kokkos_impl.h compute_sna_grid.cpp
+action compute_sna_grid_local_kokkos.cpp compute_sna_grid_local.cpp
+action compute_sna_grid_local_kokkos.h compute_sna_grid_local.h
+action compute_sna_grid_local_kokkos_impl.h compute_sna_grid_local.cpp
+action compute_gaussian_grid_local_kokkos.cpp compute_gaussian_grid_local.cpp
+action compute_gaussian_grid_local_kokkos.h compute_gaussian_grid_local.h
 action compute_temp_deform_kokkos.cpp
 action compute_temp_deform_kokkos.h
 action compute_temp_kokkos.cpp
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@ -0,0 +1,308 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Drew Rohskopf (SNL)
+------------------------------------------------------------------------- */
+
+#include "compute_gaussian_grid_local_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "pair.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+// Constructor: forwards the arguments to ComputeGaussianGridLocal and then
+// mirrors the base-class per-type tables (cutsq, radelem, sigmaelem,
+// prefacelem, argfacelem) into Kokkos views for use by the kernel.
+template<class DeviceType>
+ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) :
+  ComputeGaussianGridLocal(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  // dual view with the pairwise squared cutoffs; its device side is aliased
+  // by the const view rnd_cutsq that the kernel reads
+  k_cutsq = tdual_fparams("ComputeGaussianGridLocalKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  // fill the host side from the base-class cutsq table (types are 1-based)
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = 1; j <= atom->ntypes; j++){
+      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq[i][j];
+    }
+  }
+  // flag the host copy as modified once, after all entries have been written
+  k_cutsq.template modify<LMPHostType>();
+
+  // Set up element lists
+  int n = atom->ntypes;
+  MemKK::realloc_kokkos(d_radelem,"ComputeGaussianGridLocalKokkos::radelem",n);
+  MemKK::realloc_kokkos(d_sigmaelem,"ComputeGaussianGridLocalKokkos::sigmaelem",n+1);
+  MemKK::realloc_kokkos(d_prefacelem,"ComputeGaussianGridLocalKokkos::prefacelem",n+1);
+  MemKK::realloc_kokkos(d_argfacelem,"ComputeGaussianGridLocalKokkos::argfacelem",n+1);
+  MemKK::realloc_kokkos(d_map,"ComputeGaussianGridLocalKokkos::map",n+1);
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_sigmaelem = Kokkos::create_mirror_view(d_sigmaelem);
+  auto h_prefacelem = Kokkos::create_mirror_view(d_prefacelem);
+  auto h_argfacelem = Kokkos::create_mirror_view(d_argfacelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  // the base-class arrays are indexed by 1-based atom type; the device
+  // arrays are 0-based, hence the shift by one
+  for (int i = 1; i <= atom->ntypes; i++) {
+    h_radelem(i-1) = radelem[i];
+    h_sigmaelem(i-1) = sigmaelem[i];
+    h_prefacelem(i-1) = prefacelem[i];
+    h_argfacelem(i-1) = argfacelem[i];
+  }
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_sigmaelem,h_sigmaelem);
+  Kokkos::deep_copy(d_prefacelem, h_prefacelem);
+  Kokkos::deep_copy(d_argfacelem, h_argfacelem);
+  // NOTE(review): h_map is never filled in this constructor, so d_map only
+  // receives the mirror's initial contents -- confirm d_map is unused
+  Kokkos::deep_copy(d_map,h_map);
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Destructor: releases the Kokkos-managed cutoff and output arrays.
+template<class DeviceType>
+ComputeGaussianGridLocalKokkos<DeviceType>::~ComputeGaussianGridLocalKokkos()
+{
+  // compute_local() sets copymode while it launches the kernel with *this as
+  // the functor; skip the cleanup below while that flag is set
+  if (copymode) return;
+
+  // NOTE(review): k_cutsq is constructed directly in the constructor rather
+  // than through memoryKK->create_kokkos(); confirm that destroy_kokkos() with
+  // the base-class cutsq pointer is the intended release path
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  memoryKK->destroy_kokkos(k_alocal,alocal);
+  //gridlocal_allocated = 0;
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void ComputeGaussianGridLocalKokkos<DeviceType>::setup()
+{
+  // base-class setup runs first; the allocation below is sized with
+  // size_local_rows and size_local_cols
+  ComputeGridLocal::setup();
+
+  // create the dual view that backs the local output array
+  memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
+
+  // device-side handle written by the kernel, and the host pointer that is
+  // exposed as array_local
+  d_alocal = k_alocal.template view<DeviceType>();
+  array_local = alocal;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// No Kokkos-specific initialization: forwards to the base-class init().
+template<class DeviceType>
+void ComputeGaussianGridLocalKokkos<DeviceType>::init()
+{
+  ComputeGaussianGridLocal::init();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Fills the local grid array by launching the
+// TagComputeGaussianGridLocalNeigh kernel over all grid points, in chunks of
+// at most chunksize points, and then syncs the result back to the host.
+template<class DeviceType>
+void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
+{
+  // NOTE(review): on the host execution space this returns without computing
+  // anything and without updating invoked_local -- confirm this is intended
+  if (host_flag) {
+    return;
+  }
+
+  invoked_local = update->ntimestep;
+
+  // the destructor returns early while copymode is set; *this is handed to
+  // Kokkos::parallel_for as the functor below
+  copymode = 1;
+
+  // grid extents between the lo/hi bounds and the total number of grid points
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  total_range = zlen*ylen*xlen;
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // max_neighs is defined here - think of more elaborate methods.
+  max_neighs = 100;
+
+  // the kernel loops over all local and ghost atoms for every grid point
+  ntotal = atomKK->nlocal + atomKK->nghost;
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeGaussianGridLocalKokkos:ninside",total_range);
+
+  // `total_range` is the number of grid points, which may be larger than the
+  // chunk size; the grid is therefore processed in chunks
+  chunksize = 32768;
+  chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
+
+  int vector_length_default = 1;
+  int team_size_default = 1; // cost will increase with increasing team size
+
+  // cache the box parameters in plain members so the kernel can apply the
+  // triclinic transformation without dereferencing domain
+  if (triclinic){
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];
+    h5 = domain->h[5];
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
+
+    // the last chunk may be shorter than the others
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
+
+    //Neigh
+    {
+      int vector_length = vector_length_default;
+      int team_size = team_size_default;
+      check_team_size_for<TagComputeGaussianGridLocalNeigh>(chunk_size,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh> policy_neigh(chunk_size,team_size,vector_length);
+      Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+  } // end while
+
+  copymode = 0;
+
+  // the kernel wrote d_alocal on the device; make the host copy current
+  k_alocal.template modify<DeviceType>();
+  k_alocal.template sync<LMPHostType>();
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Kernel body: one league member per grid point of the current chunk.
+// Writes the grid indices and coordinates into the first six columns of
+// d_alocal and accumulates one Gaussian sum per atom type in the columns
+// starting at size_local_cols_base.
+// NOTE(review): the body is not split over team threads, so every thread of
+// a team would repeat the same work and the same "+=" updates; compute_local()
+// requests a team size of 1 -- confirm before raising it.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianGridLocalNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const
+{
+  const int ii = team.league_rank();
+
+  if (ii >= chunk_size) return;
+
+  // flat grid index: position within the chunk plus the chunk offset
+  int igrid = ii + chunk_offset;
+
+  // decompose the flat index into (ix,iy,iz), x varying fastest, and shift
+  // by the lower grid bounds
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // multiply the grid integers by the grid spacing delx, dely, delz
+  // (inlined here instead of calling grid2x(igrid, xgrid))
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // domainKK->lamda2x(xgrid, xgrid) cannot be called here: it is a
+    // __host__ function and this operator() is __host__ __device__.
+    // The transformation is therefore written out with the box values
+    // h0..h5 and lo0..lo2 that compute_local() cached in plain members.
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+
+  // Zeroing out the components, which are filled as a sum.
+  for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
+    d_alocal(igrid, icol) = 0.0;
+  }
+
+  // Fill grid info columns
+  d_alocal(igrid, 0) = ix;
+  d_alocal(igrid, 1) = iy;
+  d_alocal(igrid, 2) = iz;
+  d_alocal(igrid, 3) = xtmp;
+  d_alocal(igrid, 4) = ytmp;
+  d_alocal(igrid, 5) = ztmp;
+
+  // Loop over all ntotal atoms (no neighbor list is used for now).
+  for (int j = 0; j < ntotal; j++){
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    // atoms inside the same-type cutoff contribute
+    // prefacelem * exp(-rsq * argfacelem) to the column of their type
+    if (rsq < rnd_cutsq(jtype, jtype) ) {
+      int icol = size_local_cols_base + jtype - 1;
+      d_alocal(igrid, icol) += d_prefacelem(jtype-1) * exp(-rsq * d_argfacelem(jtype-1));
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   check max team size
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<class TagStyle>
+void ComputeGaussianGridLocalKokkos<DeviceType>::check_team_size_for(int inum, int &team_size, int vector_length) {
+  // largest team size Kokkos reports for this functor and tag in a parallel_for
+  const int max_team = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  // shrink the request only when team_size * vector_length exceeds that limit
+  const bool too_large = team_size*vector_length > max_team;
+  if (too_large) {
+    team_size = max_team/vector_length;
+  }
+}
+
+// Explicit instantiations of the class template defined in this file.
+// The host instantiation is only added for LMP_KOKKOS_GPU builds;
+// presumably LMPHostType and LMPDeviceType coincide otherwise -- TODO confirm.
+namespace LAMMPS_NS {
+template class ComputeGaussianGridLocalKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeGaussianGridLocalKokkos<LMPHostType>;
+#endif
+}
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
@ -0,0 +1,96 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(gaussian/grid/local/kk,ComputeGaussianGridLocalKokkos<LMPDeviceType>);
+ComputeStyle(gaussian/grid/local/kk/device,ComputeGaussianGridLocalKokkos<LMPDeviceType>);
+ComputeStyle(gaussian/grid/local/kk/host,ComputeGaussianGridLocalKokkos<LMPHostType>);
+// clang-format on
+
+#else
+
+#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H
+#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H
+
+#include "compute_gaussian_grid_local.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+// clang-format off
+struct TagComputeGaussianGridLocalNeigh{};
+// clang-format on
+
+// Kokkos variant of ComputeGaussianGridLocal, templated on the Kokkos device
+// type. compute_local() launches operator() once per grid point.
+template <class DeviceType> class ComputeGaussianGridLocalKokkos : public ComputeGaussianGridLocal {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  // Static team/tile sizes for device offload
+  // NOTE(review): team_size_compute_neigh is not referenced by the member
+  // functions in compute_gaussian_grid_local_kokkos.cpp -- confirm it is needed
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+#endif
+
+  ComputeGaussianGridLocalKokkos(class LAMMPS *, int, char **);
+  ~ComputeGaussianGridLocalKokkos() override;
+  void setup() override;
+  void init() override;
+  void compute_local() override;
+
+  // lowers the team size (2nd argument) when team_size * vector_length
+  // exceeds the maximum Kokkos reports for the tagged kernel
+  template<class TagStyle>
+  void check_team_size_for(int, int&, int);
+
+  // kernel body for one grid point; the league rank selects the grid point
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeGaussianGridLocalNeigh, const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const;
+
+ private:
+  // per-type tables copied from the base class in the constructor
+  // (0-based: entry i-1 holds the value for atom type i)
+  Kokkos::View<double*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<double*, DeviceType> d_sigmaelem;
+  Kokkos::View<double*, DeviceType> d_prefacelem;
+  Kokkos::View<double*, DeviceType> d_argfacelem;
+  Kokkos::View<int*, DeviceType> d_ninside;                // ninside for all atoms in list
+  Kokkos::View<int*, DeviceType> d_map;                    // mapping from atom types to elements
+
+  // squared cutoffs: dual view plus a const random-access alias of its
+  // device side that the kernel reads
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
+
+  // chunk_size/chunk_offset describe the chunk of grid points currently
+  // being processed; both are set in compute_local() and read by the kernel
+  int max_neighs, inum, chunk_size, chunk_offset;
+  int host_flag;   // nonzero when execution_space == Host
+  int total_range; // total number of loop iterations in grid
+  int xlen, ylen, zlen; // grid extents (hi - lo + 1) per dimension
+  int chunksize;   // upper bound on chunk_size
+  int ntotal;      // nlocal + nghost, number of atoms the kernel loops over
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+
+  // local output array: dual view and its device-side view
+  DAT::tdual_float_2d k_alocal;
+  typename AT::t_float_2d d_alocal;
+
+  // triclinic vars: copies of domain->h[0..5] and domain->boxlo[0..2]
+  double h0, h1, h2, h3, h4, h5;
+  double lo0, lo1, lo2;
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
--- a/src/KOKKOS/compute_sna_grid_kokkos.cpp
+++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp
@ -0,0 +1,25 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_kokkos.h"
+#include "compute_sna_grid_kokkos_impl.h"
+
+namespace LAMMPS_NS {
+
+// Explicit instantiations; the member definitions come from
+// compute_sna_grid_kokkos_impl.h, which is included above.
+// The host class is only instantiated for LMP_KOKKOS_GPU builds.
+template class ComputeSNAGridKokkosDevice<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridKokkosHost<LMPHostType>;
+#endif
+
+}
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@ -0,0 +1,297 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+// Register the Kokkos variants of compute sna/grid. The plain "kk" style and
+// the explicit "kk/device" style both map to the device wrapper class.
+ComputeStyle(sna/grid/kk,ComputeSNAGridKokkosDevice<LMPDeviceType>);
+ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkosDevice<LMPDeviceType>);
+#ifdef LMP_KOKKOS_GPU
+// With LMP_KOKKOS_GPU defined the host style uses the dedicated host wrapper.
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosHost<LMPHostType>);
+#else
+// Otherwise the host style reuses the device wrapper, instantiated on LMPHostType.
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
+#endif
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
+#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
+
+#include "compute_sna_grid.h"
+#include "kokkos_type.h"
+#include "sna_kokkos.h"
+
+namespace LAMMPS_NS {
+
+// Dispatch tags: empty structs used as the first argument of the operator()
+// overloads of ComputeSNAGridKokkos, so that a Kokkos policy templated on a
+// tag selects the matching kernel body.
+
+// GPU backend only
+struct TagCSNAGridComputeNeigh{};       // collect in-cutoff atoms for each grid point
+struct TagCSNAGridComputeCayleyKlein{}; // pre-compute Cayley-Klein parameters per neighbor
+struct TagCSNAGridPreUi{};              // initialize ulisttot
+struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence
+struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
+struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+// The bool selects the multi-element ("chemsnap") variant of the kernel.
+template <bool chemsnap> struct TagCSNAGridComputeZi{};
+template <bool chemsnap> struct TagCSNAGridComputeBi{};
+struct TagCSNAGridLocalFill{}; // fill the gridlocal array
+
+struct TagComputeSNAGridLoop{}; // flat (1D) loop over grid points
+struct TagComputeSNAGrid3D{};   // 3D (iz, iy, ix) loop over grid points
+
+// CPU backend only
+struct TagComputeSNAGridLoopCPU{};
+
+// Kokkos-accelerated variant of ComputeSNAGrid.
+//
+// Template parameters:
+//   DeviceType      Kokkos device type used for the views and execution policies
+//   real_type_      floating-point type handed to SNAKokkos (exposed as real_type)
+//   vector_length_  compile-time vector length (exposed as vector_length); the
+//                   kernels split a chunk-local grid point index ii as
+//                   ii = iatom_mod + iatom_div * vector_length
+//
+// The class acts as its own Kokkos functor: every operator() overload below is
+// one kernel body, selected by the tag type in its first argument.
+template<class DeviceType, typename real_type_, int vector_length_>
+class ComputeSNAGridKokkos : public ComputeSNAGrid {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  static constexpr int vector_length = vector_length_;
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+
+  // Static team/tile sizes for device offload.
+  // Two sets of values are provided: one when KOKKOS_ENABLE_HIP is defined
+  // and one for every other configuration.
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+  static constexpr int tile_size_compute_ck = 2;
+  static constexpr int tile_size_pre_ui = 2;
+  static constexpr int team_size_compute_ui = 2;
+  static constexpr int tile_size_transform_ui = 2;
+  static constexpr int tile_size_compute_zi = 2;
+  static constexpr int min_blocks_compute_zi = 0; // no minimum bound
+  static constexpr int tile_size_compute_bi = 2;
+  static constexpr int tile_size_compute_yi = 2;
+  static constexpr int min_blocks_compute_yi = 0; // no minimum bound
+  static constexpr int team_size_compute_fused_deidrj = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+  static constexpr int tile_size_compute_ck = 4;
+  static constexpr int tile_size_pre_ui = 4;
+  // single precision (4-byte real_type) uses twice the team size of double
+  static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
+  static constexpr int tile_size_transform_ui = 4;
+  static constexpr int tile_size_compute_zi = 8;
+  static constexpr int tile_size_compute_bi = 4;
+  static constexpr int tile_size_compute_yi = 8;
+  static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
+
+  // this empirically reduces perf fluctuations from compiler version to compiler version
+  static constexpr int min_blocks_compute_zi = 4;
+  static constexpr int min_blocks_compute_yi = 4;
+#endif
+
+  // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
+  // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
+  // and reduces the verbosity of the LaunchBound by hiding the explicit
+  // multiplication by vector_length
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles, min_blocks>, TagComputeSNA>;
+
+  // MDRangePolicy for the 3D grid loop:
+  // NOTE(review): the TagComputeSNA parameter is accepted but not forwarded to
+  // the MDRangePolicy, so this alias yields an untagged policy - confirm that
+  // this is intended before using it with the tagged operator() overloads.
+  template <class Device, class TagComputeSNA>
+  using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
+
+  // Testing out team policies
+  // (currently the same definition as SnapAoSoATeamPolicy below)
+  template <class Device, int num_teams,  class TagComputeSNA>
+  using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+
+  // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
+  // This hides the LaunchBounds abstraction by hiding the explicit
+  // multiplication by vector length
+  template <class Device, int num_teams, class TagComputeSNA>
+  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+
+  // Helper routine that builds a Snap3DRangePolicy over the index space
+  // [0,vector_length) x [0,second_loop) x [0,chunk_size_div) with tiles of
+  // shape (vector_length, num_tiles, 1).
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  auto snap_get_policy(const int& chunk_size_div, const int& second_loop) {
+    return Snap3DRangePolicy<Device, num_tiles, TagComputeSNA, min_blocks>({0, 0, 0},
+                                                                 {vector_length, second_loop, chunk_size_div},
+                                                                 {vector_length, num_tiles, 1});
+  }
+
+  ComputeSNAGridKokkos(class LAMMPS *, int, char **);
+  ~ComputeSNAGridKokkos() override;
+
+  void setup() override;
+  void compute_array() override;
+
+  // Utility functions for teams
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&);
+
+  template<class TagStyle>
+  void check_team_size_reduce(int, int&);
+
+  // Kernel bodies. Each overload is selected by its tag argument; the
+  // remaining arguments are the loop indices (or team handle) supplied by the
+  // policy the kernel is launched with.
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLoop, const int& ) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLoopCPU, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
+
+  // 3D case - used by parallel_for
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
+
+  // PreUi, TransformUi, ComputeZi and ComputeBi each come in three overloads:
+  // a 3-index form (iatom_mod, inner index, iatom_div), a 2-index form
+  // (iatom, inner index) and a 1-index form (iatom).
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi, const int& iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiSmall>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiLarge>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformUi, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom_mod, const int& idxz, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom, const int& idxz) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom_mod, const int& idxb, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom, const int& idxb) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalFill,const int& ii) const;
+
+ protected:
+
+  // SNAP helper object; constructed from *this in the constructor
+  SNAKokkos<DeviceType, real_type, vector_length> snaKK;
+
+  int max_neighs;   // neighbor slots per grid point passed to snaKK.grow_rij()
+  int chunk_size;   // number of grid points processed per pass of the chunk loop
+  int chunk_offset; // flat index of the first grid point of the current chunk
+  int host_flag;    // nonzero when execution_space == Host
+  int ntotal;       // nlocal + nghost atoms, set in compute_array()
+  int total_range; // total number of loop iterations in grid
+  int zlen; // set to nzhi-nzlo+1 in compute_array()
+  int ylen; // set to nyhi-nylo+1 in compute_array()
+  int xlen; // set to nxhi-nxlo+1 in compute_array()
+
+  double cutsq_tmp; // temporary cutsq until we get a view
+
+  Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
+  Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
+  Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
+  Kokkos::View<T_INT*, DeviceType> d_ninside;                // number of in-cutoff atoms per grid point, indexed by chunk-local grid index
+  Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
+  Kokkos::View<real_type*, DeviceType> d_test;              // test view
+
+  // cutoff-squared table: dual (host+device) view plus a const random-access
+  // device view of the same data used inside kernels
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
+  // atom positions and types, refreshed from atomKK in compute_array()
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+  DAT::tdual_float_2d k_grid;
+  DAT::tdual_float_2d k_gridall;
+  typename AT::t_float_2d d_grid;
+  typename AT::t_float_2d d_gridall;
+
+  DAT::tdual_float_4d k_gridlocal;
+  typename AT::t_float_4d d_gridlocal;
+
+
+  // Utility routine which wraps computing per-team scratch size requirements for
+  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
+  template <typename scratch_type>
+  int scratch_size_helper(int values_per_team);
+
+  class DomainKokkos *domainKK;
+
+  // triclinic vars: copies of domain->h[0..5] and domain->boxlo[0..2], filled
+  // in compute_array() so that kernels can convert grid coordinates without
+  // calling host-only Domain routines
+  double h0, h1, h2, h3, h4, h5;
+  double lo0, lo1, lo2;
+
+  // Make SNAKokkos a friend
+  friend class SNAKokkos<DeviceType, real_type, vector_length>;
+};
+
+// These wrapper classes exist to make the compute style factory happy/avoid having
+// to extend the compute  style factory to support Compute classes w/an arbitrary number
+// of extra template parameters
+
+// Single-template-parameter wrapper that fixes the real type to
+// SNAP_KOKKOS_REAL and the vector length to SNAP_KOKKOS_DEVICE_VECLEN.
+template <class DeviceType>
+class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
+
+ private:
+  // shorthand for the fully specified base class
+  using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
+
+ public:
+
+  ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
+
+  void compute_array() override;
+
+};
+
+#ifdef LMP_KOKKOS_GPU
+// Host counterpart of ComputeSNAGridKokkosDevice: same real type, but the
+// vector length is SNAP_KOKKOS_HOST_VECLEN. Only declared when
+// LMP_KOKKOS_GPU is defined.
+template <class DeviceType>
+class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
+
+ private:
+  // shorthand for the fully specified base class
+  using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
+
+ public:
+
+  ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
+
+  void compute_array() override;
+
+};
+#endif
+
+}
+
+#endif
+#endif
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@ -0,0 +1,786 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
+                         Evan Weinberg (NVIDIA)
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_kokkos.h"
+#include "pair_snap_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "domain.h"
+#include "domain_kokkos.h"
+#include "sna.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#include <iostream>
+
+#define MAXLINE 1024
+#define MAXWORD 3
+
+namespace LAMMPS_NS {
+
+// Constructor
+
+// Builds the Kokkos-side state on top of ComputeSNAGrid: the cutsq dual view,
+// the per-element device views (radius, weight, inner switching parameters,
+// type-to-element map) copied from the host arrays of the base class, and the
+// SNAKokkos helper object.
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  domainKK = (DomainKokkos *) domain;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  const int ntypes = atom->ntypes;
+
+  k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",ntypes+1,ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
+
+  cutsq_tmp = cutsq[1][1];
+
+  // Every type pair currently shares the (1,1) cutoff. The loop visits all
+  // (i,j) pairs, so a single store per pair fills the whole table.
+  for (int i = 1; i <= ntypes; i++) {
+    for (int j = 1; j <= ntypes; j++){
+      k_cutsq.h_view(i,j) = cutsq_tmp;
+    }
+  }
+  // flag the host copy as modified once, after the table is complete
+  k_cutsq.template modify<LMPHostType>();
+
+  // Set up element lists
+  // NOTE(review): the views are sized by nelements while the copy loop below
+  // runs over atom types - confirm nelements >= ntypes for all inputs.
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
+  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements);
+  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
+  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
+  // test
+  MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements);
+
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",ntypes+1);
+
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
+  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
+  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  // test
+  auto h_test = Kokkos::create_mirror_view(d_test);
+  h_test(0) = 2.0;
+
+  // The host arrays of compute sna/grid are indexed from 1, the device views
+  // from 0 (the kernels read them as d_xxx[ielem] / d_xxx[jelem] with element
+  // indices starting at 0). All four per-element views must therefore be
+  // filled at i-1; filling the inner-switching views at i would leave entry 0
+  // unset and write one past the end for the last type.
+  for (int i = 1; i <= ntypes; i++) {
+    h_radelem(i-1) = radelem[i];
+    h_wjelem(i-1) = wjelem[i];
+    if (switchinnerflag){
+      h_sinnerelem(i-1) = sinnerelem[i];
+      h_dinnerelem(i-1) = dinnerelem[i];
+    }
+  }
+
+  // In pair snap some things like `map` get allocated regardless of chem flag.
+  if (chemflag){
+    for (int i = 1; i <= ntypes; i++) {
+      h_map(i) = map[i];
+    }
+  }
+
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_wjelem,h_wjelem);
+  if (switchinnerflag){
+    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
+    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
+  }
+  if (chemflag){
+    Kokkos::deep_copy(d_map,h_map);
+  }
+  Kokkos::deep_copy(d_test,h_test);
+
+  // The helper keeps its own state; start with empty rij storage, which is
+  // grown to the real chunk size in compute_array().
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
+  snaKK.grow_rij(0,0);
+  snaKK.init();
+}
+
+// Destructor
+
+// Releases the cutsq and gridall storage owned through memoryKK.
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
+{
+  // compute_array() sets copymode while it hands *this to Kokkos kernels;
+  // presumably the functor copies made then must not free shared storage.
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  memoryKK->destroy_kokkos(k_gridall, gridall);
+}
+
+// Setup
+
+// Sets the global and local grid extents and allocates the gridall dual view
+// that backs the compute's output array.
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
+{
+  // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there.
+  // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices.
+
+  ComputeGrid::set_grid_global();
+  ComputeGrid::set_grid_local();
+
+  // allocate arrays
+  // NOTE(review): nothing here releases a gridall allocation from an earlier
+  // setup() call - confirm whether setup() can run more than once per instance.
+  memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
+
+  // do not use or allocate gridlocal for now
+
+  gridlocal_allocated = 0;
+  // the compute's output array aliases the host side of k_gridall
+  array = gridall;
+
+  // cache the device views used inside the kernels
+  d_gridlocal = k_gridlocal.template view<DeviceType>();
+  d_gridall = k_gridall.template view<DeviceType>();
+}
+
+// Compute
+
+// Evaluates the bispectrum components on the local grid.
+// On the host execution space this simply defers to ComputeSNAGrid. Otherwise
+// the grid points are processed in chunks of at most `chunksize` points, and
+// for each chunk the kernels ComputeNeigh, ComputeCayleyKlein, PreUi,
+// ComputeUi(Small|Large), TransformUi, ComputeZi, ComputeBi and the grid fill
+// are launched in that order.
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
+{
+  if (host_flag) {
+    ComputeSNAGrid::compute_array();
+    return;
+  }
+
+  // *this is passed to Kokkos as the functor below; see the destructor for
+  // how copymode is used
+  copymode = 1;
+
+  // extents of the local grid and total number of local grid points
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
+
+  // make atom data current in the execution space and grab the views
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // max_neighs is defined here - think of more elaborate methods.
+  // NOTE(review): ComputeNeigh stores one entry per in-cutoff atom without
+  // checking against max_neighs - confirm 100 slots is always sufficient.
+  max_neighs = 100;
+
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
+  // number of atoms.
+
+  ntotal = atomKK->nlocal + atomKK->nghost;
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
+
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
+  // `total_range` is the number of grid points which may be larger than chunk size.
+  chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
+  snaKK.grow_rij(chunk_size, max_neighs);
+
+  // Pre-compute ceil(chunk_size / vector_length) for code cleanliness.
+  // This uses the initial chunk_size; a shorter final chunk is handled by the
+  // `>= chunk_size` guards inside the kernels.
+  const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+
+  // cache the triclinic box parameters as plain members for use in kernels
+  if (triclinic) {
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];
+    h5 = domain->h[5];
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
+
+    // shrink the last chunk to the number of remaining grid points
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
+
+
+    //ComputeNeigh
+    {
+      // per-team scratch for team_size_compute_neigh * max_neighs ints
+      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs);
+
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
+        policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
+      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+    }
+
+    //ComputeCayleyKlein
+    {
+      // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
+        policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
+      Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
+    }
+
+    //PreUi
+    {
+      auto policy_pre_ui = snap_get_policy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>(chunk_size_div, twojmax + 1);
+      Kokkos::parallel_for("PreUi", policy_pre_ui, *this);
+    }
+
+    // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
+    {
+      // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
+      // scratch size: vector_length grid points * (twojmax+1) cached values, no double buffer
+      const int tile_size = vector_length * (twojmax + 1);
+      const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
+
+      if (chunk_size < parallel_thresh)
+      {
+        // Version with parallelism over j_bend
+
+        // total number of work items: ceil(chunk_size / vector_length) * max_neighs * ("bend" locations)
+        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
+      } else {
+        // Version w/out parallelism over j_bend
+
+        // total number of work items: ceil(chunk_size / vector_length) * max_neighs
+        const int n_teams = chunk_size_div * max_neighs;
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
+      }
+    }
+
+    //TransformUi: un-"fold" ulisttot, zero ylist
+    {
+      // Expand ulisttot_re,_im -> ulisttot
+      // Zero out ylist
+      auto policy_transform_ui = snap_get_policy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>(chunk_size_div, snaKK.idxu_max);
+      Kokkos::parallel_for("TransformUi", policy_transform_ui, *this);
+    }
+
+    //Compute bispectrum
+    // tile_size_compute_zi and tile_size_compute_bi are defined in `compute_sna_grid_kokkos.h`
+
+    //ComputeZi and Bi
+    // more than one element selects the chemsnap (<true>) kernel variants
+    if (nelements > 1) {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi<true>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this);
+
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi<true>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this);
+    } else {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi<false>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);
+
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi<false>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this);
+    }
+
+    // Fill the grid array with bispectrum values
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
+      Kokkos::parallel_for(policy_fill, *this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+
+  } // end while
+
+  copymode = 0;
+
+  // mark the device copies as modified and bring the host copies up to date
+  k_gridlocal.template modify<DeviceType>();
+  k_gridlocal.template sync<LMPHostType>();
+
+  k_gridall.template modify<DeviceType>();
+  k_gridall.template sync<LMPHostType>();
+}
+
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the GPU codepath. These take advantage
+   of AoSoA data layouts and scratch memory for recursive polynomials
+------------------------------------------------------------------------- */
+
+/*
+ Simple team policy functor seeing how many layers deep we can go with the parallelism.
+ */
+// ComputeNeigh kernel: for one grid point of the current chunk, finds every
+// atom (local + ghost) inside the cutoff and stores its displacement, weight,
+// cutoff, index and element in the snaKK per-neighbor arrays. The number of
+// stored neighbors is written to d_ninside.
+//
+// This follows a similar procedure as ComputeNeigh of PairSNAPKokkos, but
+// grid points are not atoms and have no neighbor list, so the kernel loops
+// over all ntotal atoms instead.
+// TODO: If we did make a neighborlist for each grid point, we could use the
+//       existing routines and avoid the loop over all atoms.
+//
+// NOTE(review): entries are stored at slot 0..ninside-1 with no check against
+// max_neighs - confirm the rij storage is always large enough.
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
+
+  // chunk-local grid point index handled by this thread:
+  //   team_rank   : rank of thread in this team
+  //   league_rank : rank of team in this league
+  //   team_size   : number of threads in this team
+  const int ii = team.team_rank() + team.league_rank() * team.team_size();
+
+  if (ii >= chunk_size) return;
+
+  // flat index of the grid point within the local grid
+  const int igrid = ii + chunk_offset;
+
+  // convert the flat index to (ix,iy,iz), x running fastest, then shift by
+  // the lower bounds of the local grid
+  int iz = igrid/(xlen*ylen);
+  const int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  // multiply grid integers by grid spacing delx, dely, delz
+  double xgrid[3];
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Same conversion as in the CPU version. Domain::lamda2x() is a host
+    // function and cannot be called from a kernel, so the transformation is
+    // written out with the box parameters cached in compute_array().
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+
+  // currently, all grid points are type 1
+  // not clear what a better choice would be
+
+  const int itype = 1;
+  int ielem = 0;
+  if (chemflag) ielem = d_map[itype];
+
+  // Single pass over all atoms: count and store with the same predicate, so
+  // that d_ninside always equals the number of entries actually written.
+  // TODO: Adjust for multi-element, currently jelem = 0 unless chemflag is set.
+  int ninside = 0;
+  for (int j = 0; j < ntotal; j++){
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    const int jtype = type(j);
+
+    // keep atoms inside the cutoff, but not atoms that share their location
+    // with the grid point
+    if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
+      int jelem = 0;
+      if (chemflag) jelem = d_map[jtype];
+      snaKK.rij(ii,ninside,0) = static_cast<real_type>(dx);
+      snaKK.rij(ii,ninside,1) = static_cast<real_type>(dy);
+      snaKK.rij(ii,ninside,2) = static_cast<real_type>(dz);
+      // the per-element views are indexed from 0, hence jelem and not jtype
+      snaKK.wj(ii,ninside) = static_cast<real_type>(d_wjelem[jelem]);
+      snaKK.rcutij(ii,ninside) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      snaKK.inside(ii,ninside) = j;
+      if (switchinnerflag) {
+        snaKK.sinnerij(ii,ninside) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        snaKK.dinnerij(ii,ninside) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+      }
+      if (chemflag)
+        snaKK.element(ii,ninside) = jelem;
+      else
+        snaKK.element(ii,ninside) = 0;
+      ninside++;
+    }
+  }
+
+  d_ninside(ii) = ninside;
+}
+
+/* ----------------------------------------------------------------------
+  Pre-compute the Cayley-Klein parameters for reuse in later routines
+------------------------------------------------------------------------- */
+
+// ComputeCayleyKlein kernel for one (grid point, neighbor slot) pair.
+// The grid point arrives split into iatom_mod (position within a vector) and
+// iatom_div (vector number); jnbor is the neighbor slot.
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
+
+  // reassemble the chunk-local grid point index
+  const int igrid_local = iatom_div * vector_length + iatom_mod;
+
+  // Work only for grid points inside the chunk and for occupied neighbor
+  // slots; the short-circuit keeps d_ninside from being read out of range.
+  const bool has_work = (igrid_local < chunk_size) && (jnbor < d_ninside(igrid_local));
+
+  if (has_work) snaKK.compute_cayley_klein(igrid_local, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+  Initialize the "ulisttot" structure with non-zero on-diagonal terms
+  and zero terms elsewhere
+------------------------------------------------------------------------- */
+
+// PreUi kernels (3-, 2- and 1-index forms).
+//
+// The loop index identifies a grid point of the current chunk, not an atom,
+// so it must not be used to index the per-atom `type` view: that reads an
+// unrelated atom's type and runs out of bounds when a chunk holds more grid
+// points than there are atoms. As in the ComputeNeigh kernel, every grid
+// point is treated as type 1 and its element is looked up only when chemflag
+// is set (d_map is only filled in that case).
+
+// 3-index form: iatom_mod / iatom_div are the position within a vector and
+// the vector number of the grid point, j is the index passed to pre_ui().
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  // currently, all grid points are type 1
+  const int itype = 1;
+  const int ielem = chemflag ? d_map[itype] : 0;
+
+  snaKK.pre_ui(iatom, j, ielem);
+}
+
+// 2-index form: iatom is the chunk-local grid point index.
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const {
+  if (iatom >= chunk_size) return;
+
+  // currently, all grid points are type 1
+  const int itype = 1;
+  const int ielem = chemflag ? d_map[itype] : 0;
+
+  snaKK.pre_ui(iatom, j, ielem);
+}
+
+// 1-index form: loops over j = 0..twojmax for the given grid point.
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+
+  // currently, all grid points are type 1
+  const int itype = 1;
+  const int ielem = chemflag ? d_map[itype] : 0;
+
+  for (int j = 0; j <= twojmax; j++)
+    snaKK.pre_ui(iatom, j, ielem);
+}
+
+// ComputeUi kernel, "small" variant: one work item per
+// (vector of grid points, bend location, neighbor slot).
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
+
+  // flattened (iatom_div, jbend, jj) work item of this thread
+  const int work_item = team.league_rank() * team_size_compute_ui + team.team_rank();
+
+  // number of work items belonging to one vector of grid points
+  const int items_per_vector = max_neighs * (twojmax + 1);
+
+  // peel off the vector number, then the bend location, then the neighbor slot
+  int iatom_div = work_item / items_per_vector; // not "const": works around a GCC 7 bug
+  const int within_vector = work_item % items_per_vector;
+  const int jbend = within_vector / max_neighs;
+  int jj = within_vector % max_neighs; // not "const": works around a GCC 7 bug
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+
+    // skip lanes past the end of the chunk and unoccupied neighbor slots
+    if (ii < chunk_size && jj < d_ninside(ii))
+      snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiLarge>::member_type& team) const {
+
+  // One work item per team thread: flatten (league_rank, team_rank), then
+  // split into the block index (iatom_div) and the neighbor slot (jj).
+  const int work_id = team.league_rank() * team_size_compute_ui + team.team_rank();
+
+  int iatom_div = work_id / max_neighs; // left non-const to work around GCC 7 bug
+  int jj = work_id - iatom_div * max_neighs;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = vector_length * iatom_div + iatom_mod;
+
+    // d_ninside is read only after the row check passed
+    if (ii < chunk_size && jj < d_ninside(ii))
+      snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
+  });
+}
+
+/* ----------------------------------------------------------------------
+  De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
+  structure. Zero-initialize ylist. CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const {
+  // 3-index variant: rebuild the flat row from (iatom_mod, iatom_div) and
+  // ignore work items outside [0, chunk_size) x [0, idxu_max)
+  const int row = iatom_div * vector_length + iatom_mod;
+  const bool in_range = (row < chunk_size) && (idxu < snaKK.idxu_max);
+  if (in_range) snaKK.transform_ui(row, idxu);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const {
+  // 2-index variant: only the row index is range-checked here
+  if (iatom < chunk_size) {
+    snaKK.transform_ui(iatom, idxu);
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom) const {
+  // 1-index variant: one work item handles every idxu of its row
+  if (iatom >= chunk_size) return;
+
+  int idxu = 0;
+  while (idxu < snaKK.idxu_max) {
+    snaKK.transform_ui(iatom, idxu);
+    ++idxu;
+  }
+}
+
+/* ----------------------------------------------------------------------
+  Compute all elements of the Z tensor and store them into the `zlist`
+   view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom_mod, const int& jjz, const int& iatom_div) const {
+  // 3-index variant: rebuild the flat row from (iatom_mod, iatom_div) and
+  // ignore work items outside [0, chunk_size) x [0, idxz_max)
+  const int row = iatom_div * vector_length + iatom_mod;
+  const bool in_range = (row < chunk_size) && (jjz < snaKK.idxz_max);
+  if (in_range) snaKK.template compute_zi<chemsnap>(row, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom, const int& jjz) const {
+  // 2-index variant: only the row index is range-checked here
+  if (iatom < chunk_size) {
+    snaKK.template compute_zi<chemsnap>(iatom, jjz);
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom) const {
+  // 1-index variant: one work item handles every jjz of its row
+  if (iatom >= chunk_size) return;
+
+  int jjz = 0;
+  while (jjz < snaKK.idxz_max) {
+    snaKK.template compute_zi<chemsnap>(iatom, jjz);
+    ++jjz;
+  }
+}
+
+/* ----------------------------------------------------------------------
+  Compute the energy triple products and store in the "blist" view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom_mod, const int& jjb, const int& iatom_div) const {
+  // 3-index variant: rebuild the flat row from (iatom_mod, iatom_div) and
+  // ignore work items outside [0, chunk_size) x [0, idxb_max)
+  const int row = iatom_div * vector_length + iatom_mod;
+  const bool in_range = (row < chunk_size) && (jjb < snaKK.idxb_max);
+  if (in_range) snaKK.template compute_bi<chemsnap>(row, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom, const int& jjb) const {
+  // 2-index variant: only the row index is range-checked here
+  if (iatom < chunk_size) {
+    snaKK.template compute_bi<chemsnap>(iatom, jjb);
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom) const {
+  // 1-index variant: one work item handles every jjb of its row
+  if (iatom >= chunk_size) return;
+
+  int jjb = 0;
+  while (jjb < snaKK.idxb_max) {
+    snaKK.template compute_bi<chemsnap>(iatom, jjb);
+    ++jjb;
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
+
+  // ii indexes the current chunk (it is the row used for snaKK.blist);
+  // adding chunk_offset gives the row written in d_gridall
+  const int igrid = ii + chunk_offset;
+
+  // unflatten igrid into per-axis offsets: x varies fastest, z slowest
+  const int zrel = igrid/(xlen*ylen);
+  const int rem = igrid - (zrel*xlen*ylen);
+  const int yrel = rem/xlen;
+  const int xrel = rem % xlen;
+
+  // shift by the lower grid bounds and scale by the grid spacing
+  double pos[3];
+  pos[0] = (xrel + nxlo) * delx;
+  pos[1] = (yrel + nylo) * dely;
+  pos[2] = (zrel + nzlo) * delz;
+
+  if (triclinic) {
+
+    // Same conversion as the CPU version. domain->lamda2x() cannot be called
+    // here because it is a __host__ function and operator() is
+    // __host__ __device__, so the transform is written out using the
+    // members h0..h5 and lo0..lo2. pos[1] and pos[2] are still the
+    // unconverted values when pos[0] is computed, likewise pos[2] for pos[1].
+    pos[0] = h0*pos[0] + h5*pos[1] + h4*pos[2] + lo0;
+    pos[1] = h1*pos[1] + h3*pos[2] + lo1;
+    pos[2] = h2*pos[2] + lo2;
+  }
+
+  // columns 0..2: coordinates of the grid point (converted to F_FLOAT first)
+  for (int dim = 0; dim < 3; dim++) {
+    const F_FLOAT coord = pos[dim];
+    d_gridall(igrid,dim) = coord;
+  }
+
+  // columns 3..: linear contributions, blist indexed by (row, chem block, idxb)
+  const auto nb = snaKK.idxb_max;
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    const auto chem_block = icoeff / nb;
+    const auto b_index = icoeff % nb;
+    d_gridall(igrid,icoeff+3) = snaKK.blist(ii,chem_block,b_index);
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   utility functions
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
+  // largest team size Kokkos reports for a parallel_for of this functor with TagStyle
+  const int limit = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  // shrink team_size (in/out) when team_size*vector_length exceeds that limit
+  const bool too_large = team_size*vector_length > limit;
+  if (too_large) {
+    team_size = limit/vector_length;
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
+  // largest team size Kokkos reports for a parallel_reduce of this functor with TagStyle
+  const int limit = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
+
+  // shrink team_size (in/out) when team_size*vector_length exceeds that limit
+  const bool too_large = team_size*vector_length > limit;
+  if (too_large) {
+    team_size = limit/vector_length;
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<typename scratch_type>
+int ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
+  // unmanaged 1-d view of scratch_type in the default execution space's
+  // scratch memory; only its static shmem_size() is used here
+  using ScratchViewType = Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+
+  // scratch size reported for values_per_team entries
+  const int nbytes = ScratchViewType::shmem_size(values_per_team);
+  return nbytes;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   routines used by template reference classes
+------------------------------------------------------------------------- */
+
+
+template<class DeviceType>
+ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg)
+{
+  // nothing to do here: the arguments are passed straight to the base-class constructor
+}
+
+template<class DeviceType>
+void ComputeSNAGridKokkosDevice<DeviceType>::compute_array()
+{
+  // explicitly qualified call into the templated base class
+  this->Base::compute_array();
+}
+
+#ifdef LMP_KOKKOS_GPU
+template<class DeviceType>
+ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg)
+{
+  // nothing to do here: the arguments are passed straight to the base-class constructor
+}
+
+template<class DeviceType>
+void ComputeSNAGridKokkosHost<DeviceType>::compute_array()
+{
+  // explicitly qualified call into the templated base class
+  this->Base::compute_array();
+}
+#endif
+
+}
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
@ -0,0 +1,25 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_local_kokkos.h"
+#include "compute_sna_grid_local_kokkos_impl.h"
+
+namespace LAMMPS_NS {
+
+template class ComputeSNAGridLocalKokkosDevice<LMPDeviceType>; // explicit instantiation of the device wrapper
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridLocalKokkosHost<LMPHostType>; // host wrapper, instantiated only when LMP_KOKKOS_GPU is defined
+#endif
+
+} // namespace LAMMPS_NS
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h
@ -0,0 +1,288 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS // registration pass: only the ComputeStyle() entries below are seen, the class declarations sit in the #else branch
+// clang-format off
+ComputeStyle(sna/grid/local/kk,ComputeSNAGridLocalKokkosDevice<LMPDeviceType>);
+ComputeStyle(sna/grid/local/kk/device,ComputeSNAGridLocalKokkosDevice<LMPDeviceType>);
+#ifdef LMP_KOKKOS_GPU // with LMP_KOKKOS_GPU the /kk/host style uses the dedicated Host wrapper class
+ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosHost<LMPHostType>);
+#else // otherwise /kk/host reuses the Device wrapper instantiated with LMPHostType
+ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosDevice<LMPHostType>);
+#endif
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H
+#define LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H
+
+#include "compute_sna_grid_local.h"
+#include "kokkos_type.h"
+#include "sna_kokkos.h"
+
+namespace LAMMPS_NS {
+
+// Empty tag types: each one is the first operator() argument of ComputeSNAGridLocalKokkos and selects which kernel overload runs
+
+// GPU backend only
+struct TagCSNAGridLocalComputeNeigh{}; // team-policy kernel (operator() takes a team member)
+struct TagCSNAGridLocalComputeCayleyKlein{}; // 3-index kernel (iatom_mod, jnbor, iatom_div)
+struct TagCSNAGridLocalPreUi{}; // 1-, 2- and 3-index overloads are declared
+struct TagCSNAGridLocalComputeUiSmall{}; // more parallelism, more divergence
+struct TagCSNAGridLocalComputeUiLarge{}; // less parallelism, no divergence
+struct TagCSNAGridLocalTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+template <bool chemsnap> struct TagCSNAGridLocalComputeZi{}; // templated on chemsnap; 1-, 2- and 3-index overloads are declared
+template <bool chemsnap> struct TagCSNAGridLocalComputeBi{}; // templated on chemsnap; 1-, 2- and 3-index overloads are declared
+struct TagCSNAGridLocal2Fill{}; // fill the gridlocal array
+
+struct TagComputeSNAGridLocalLoop{}; // 1-index kernel
+struct TagComputeSNAGridLocal3D{}; // 3-index kernel over (iz, iy, ix)
+
+// CPU backend only
+struct TagComputeSNAGridLocalLoopCPU{}; // 1-index kernel
+
+//template<class DeviceType>
+template<class DeviceType, typename real_type_, int vector_length_>
+class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { // Kokkos version of ComputeSNAGridLocal; also the functor for the tagged kernels declared below
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  static constexpr int vector_length = vector_length_; // width of the first (fastest) index in the 3-index kernel ranges
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+
+  // Static team/tile sizes for device offload (one set for HIP builds, one for every other backend)
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+  static constexpr int tile_size_compute_ck = 2;
+  static constexpr int tile_size_pre_ui = 2;
+  static constexpr int team_size_compute_ui = 2;
+  static constexpr int tile_size_transform_ui = 2;
+  static constexpr int tile_size_compute_zi = 2;
+  static constexpr int min_blocks_compute_zi = 0; // no minimum bound
+  static constexpr int tile_size_compute_bi = 2;
+  static constexpr int tile_size_compute_yi = 2;
+  static constexpr int min_blocks_compute_yi = 0; // no minimum bound
+  static constexpr int team_size_compute_fused_deidrj = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+  static constexpr int tile_size_compute_ck = 4;
+  static constexpr int tile_size_pre_ui = 4;
+  static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
+  static constexpr int tile_size_transform_ui = 4;
+  static constexpr int tile_size_compute_zi = 8;
+  static constexpr int tile_size_compute_bi = 4;
+  static constexpr int tile_size_compute_yi = 8;
+  static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
+
+  // this empirically reduces perf fluctuations from compiler version to compiler version
+  static constexpr int min_blocks_compute_zi = 4;
+  static constexpr int min_blocks_compute_yi = 4;
+#endif
+
+  // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
+  // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
+  // and reduces the verbosity of the LaunchBound by hiding the explicit
+  // multiplication by vector_length
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles, min_blocks>, TagComputeSNA>;
+
+  // MDRangePolicy for the 3D grid loop. NOTE(review): TagComputeSNA is not forwarded to the policy, so it carries no work tag - confirm this is intended
+  template <class Device, class TagComputeSNA>
+  using CSNAGridLocal3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
+
+  // Testing out team policies (currently the same definition as SnapAoSoATeamPolicy below)
+  template <class Device, int num_teams,  class TagComputeSNA>
+  using CSNAGridLocalTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+
+  // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
+  // This hides the LaunchBounds abstraction by hiding the explicit
+  // multiplication by vector length
+  template <class Device, int num_teams, class TagComputeSNA>
+  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+
+  // Helper routine that builds a Snap3DRangePolicy over {vector_length, second_loop, chunk_size_div} with tiles {vector_length, num_tiles, 1}
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  auto snap_get_policy(const int& chunk_size_div, const int& second_loop) {
+    return Snap3DRangePolicy<Device, num_tiles, TagComputeSNA, min_blocks>({0, 0, 0},
+                                                                 {vector_length, second_loop, chunk_size_div},
+                                                                 {vector_length, num_tiles, 1});
+  }
+
+  ComputeSNAGridLocalKokkos(class LAMMPS *, int, char **);
+  ~ComputeSNAGridLocalKokkos() override;
+
+  void setup() override;
+  void compute_local() override;
+
+  // Utility functions for teams
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&);
+
+  template<class TagStyle>
+  void check_team_size_reduce(int, int&);
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLocalLoop, const int& ) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLocalLoopCPU, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeNeigh>::member_type& team) const;
+
+  // 3D case - used by parallel_for
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeSNAGridLocal3D, const int& iz, const int& iy, const int& ix) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalPreUi, const int& iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiSmall>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiLarge>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalTransformUi, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom_mod, const int& idxz, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom, const int& idxz) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom_mod, const int& idxb, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom, const int& idxb) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocal2Fill,const int& ii) const;
+
+ protected:
+
+  SNAKokkos<DeviceType, real_type, vector_length> snaKK;
+
+  int max_neighs, chunk_size, chunk_offset; // all three are assigned in compute_local(); chunk_size/chunk_offset describe the chunk of grid points being processed
+  int host_flag; // set in the constructor to (execution_space == Host)
+  int ntotal; // nlocal + nghost, assigned in compute_local()
+  int total_range; // total number of loop iterations in grid
+  int zlen; //= nzhi-nzlo+1;
+  int ylen; //= nyhi-nylo+1;
+  int xlen; //= nxhi-nxlo+1;
+
+  double cutsq_tmp; // temporary cutsq until we get a view
+
+  Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
+  Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
+  Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
+  Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
+  Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
+  Kokkos::View<real_type*, DeviceType> d_test;              // test view (the constructor stores 2.0 in entry 0)
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+
+  DAT::tdual_float_2d k_alocal; // dual view created together with alocal in setup()
+  typename AT::t_float_2d d_alocal; // DeviceType view of k_alocal, assigned in setup()
+
+
+  // Utility routine which wraps computing per-team scratch size requirements for
+  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
+  template <typename scratch_type>
+  int scratch_size_helper(int values_per_team);
+
+  class DomainKokkos *domainKK; // domain pointer cast to DomainKokkos in the constructor
+
+  // triclinic vars: copies of domain->h[0..5] and domain->boxlo[0..2], refreshed in compute_local() when triclinic is set
+  double h0, h1, h2, h3, h4, h5;
+  double lo0, lo1, lo2;
+
+  // Make SNAKokkos a friend
+  friend class SNAKokkos<DeviceType, real_type, vector_length>;
+};
+
+// These wrapper classes exist to keep the compute style factory happy, i.e. to avoid
+// having to extend the compute style factory to support Compute classes with an
+// arbitrary number of extra template parameters
+
+template <class DeviceType>
+class ComputeSNAGridLocalKokkosDevice : public ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
+
+  // Wrapper that fixes real_type to SNAP_KOKKOS_REAL and vector_length to
+  // SNAP_KOKKOS_DEVICE_VECLEN, leaving DeviceType as the only template parameter
+
+ public:
+
+  ComputeSNAGridLocalKokkosDevice(class LAMMPS *, int, char **);
+
+  void compute_local() override;
+
+ private:
+
+  // shorthand for the fully specialized base class
+  using Base = ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
+
+};
+
+#ifdef LMP_KOKKOS_GPU
+template <class DeviceType>
+class ComputeSNAGridLocalKokkosHost : public ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
+
+  // Wrapper that fixes real_type to SNAP_KOKKOS_REAL and vector_length to
+  // SNAP_KOKKOS_HOST_VECLEN, leaving DeviceType as the only template parameter
+
+ public:
+
+  ComputeSNAGridLocalKokkosHost(class LAMMPS *, int, char **);
+
+  void compute_local() override;
+
+ private:
+
+  // shorthand for the fully specialized base class
+  using Base = ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
+
+};
+#endif
+
+}
+
+#endif
+#endif
--- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@ -0,0 +1,783 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Andrew Rohskopf (SNL)
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_local_kokkos.h"
+#include "pair_snap_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "domain.h"
+#include "domain_kokkos.h"
+#include "sna.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#include <iostream>
+
+#define MAXLINE 1024
+#define MAXWORD 3
+
+namespace LAMMPS_NS {
+
+/* ----------------------------------------------------------------------
+   Constructor: forwards (lmp, narg, arg) to ComputeSNAGridLocal, sets the
+   Kokkos bookkeeping flags, fills the cutsq dual view on the host, copies
+   the per-type element data into device views and builds the SNAKokkos
+   helper object.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridLocal(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  domainKK = (DomainKokkos *) domain;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  const int ntypes = atom->ntypes;
+
+  k_cutsq = tdual_fparams("ComputeSNAGridLocalKokkos::cutsq",ntypes+1,ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
+
+  cutsq_tmp = cutsq[1][1];
+
+  // every (i,j) type pair gets the same value, cutsq[1][1]
+  for (int i = 1; i <= ntypes; i++) {
+    for (int j = 1; j <= ntypes; j++){
+      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
+    }
+  }
+
+  // flag the host copy as modified once, after the fill, instead of once per
+  // (i,j) pair; skipped when nothing was written
+  if (ntypes > 0) k_cutsq.template modify<LMPHostType>();
+
+  // Set up element lists.
+  // The fill loop below writes radelem/wjelem at indices 0..ntypes-1 and
+  // sinnerelem/dinnerelem at indices 1..ntypes, so the views must hold at
+  // least that many entries even when nelements is smaller; otherwise the
+  // host mirrors are written out of bounds.
+  // NOTE(review): radelem/wjelem are stored shifted by one (i-1) while
+  // sinnerelem/dinnerelem are stored unshifted (i) - confirm that the kernels
+  // reading these views use the matching index convention.
+  const int n_shifted = (nelements > ntypes) ? nelements : ntypes;
+  const int n_by_type = (nelements > ntypes + 1) ? nelements : ntypes + 1;
+
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridLocalKokkos::radelem",n_shifted);
+  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridLocalKokkos:wjelem",n_shifted);
+  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridLocalKokkos:sinnerelem",n_by_type);
+  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridLocalKokkos:dinnerelem",n_by_type);
+  // test
+  MemKK::realloc_kokkos(d_test, "ComputeSNAGridLocalKokkos::test", nelements);
+
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridLocalKokkos::map",ntypes+1);
+
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
+  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
+  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  // test
+  auto h_test = Kokkos::create_mirror_view(d_test);
+  h_test(0) = 2.0;
+
+  // start from index 1 because of how compute sna/grid is
+  for (int i = 1; i <= ntypes; i++) {
+    h_radelem(i-1) = radelem[i];
+    h_wjelem(i-1) = wjelem[i];
+    if (switchinnerflag){
+      h_sinnerelem(i) = sinnerelem[i];
+      h_dinnerelem(i) = dinnerelem[i];
+    }
+  }
+
+  // In pair snap some things like `map` get allocated regardless of chem flag.
+  if (chemflag){
+    for (int i = 1; i <= ntypes; i++) {
+      h_map(i) = map[i];
+    }
+  }
+
+  // push the host mirrors to the device; the optional arrays are copied
+  // only when the corresponding flag filled them above
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_wjelem,h_wjelem);
+  if (switchinnerflag){
+    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
+    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
+  }
+  if (chemflag){
+    Kokkos::deep_copy(d_map,h_map);
+  }
+  Kokkos::deep_copy(d_test,h_test);
+
+  // build the SNAKokkos helper from this object, then give it empty
+  // neighbor storage and initialize it
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
+  snaKK.grow_rij(0,0);
+  snaKK.init();
+}
+
+// Destructor: releases the cutsq and alocal storage; skipped while copymode is set
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridLocalKokkos()
+{
+  if (!copymode) {
+    memoryKK->destroy_kokkos(k_cutsq,cutsq);
+    memoryKK->destroy_kokkos(k_alocal,alocal);
+  }
+}
+
+// Setup: runs ComputeGridLocal::setup(), then creates the alocal storage
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::setup()
+{
+  ComputeGridLocal::setup();
+
+  // host array plus dual view, size_local_rows x size_local_cols entries
+  memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
+
+  // DeviceType view of the dual view, and the host pointer published as array_local
+  d_alocal = k_alocal.template view<DeviceType>();
+  array_local = alocal;
+}
+
+// Compute
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_local()
+{
+  if (host_flag) {
+    ComputeSNAGridLocal::compute_array();
+    return;
+  }
+
+  copymode = 1;
+
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // max_neighs is defined here - think of more elaborate methods.
+  max_neighs = 100;
+
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
+  // number of atoms.
+
+  ntotal = atomKK->nlocal + atomKK->nghost;
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range);
+
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
+  // `total_range` is the number of grid points which may be larger than chunk size.
+  chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
+  //snaKK.grow_rij(chunk_size, ntotal);
+  snaKK.grow_rij(chunk_size, max_neighs);
+
+  //chunk_size = total_range;
+
+  // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
+  const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+
+  if (triclinic) {
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];
+    h5 = domain->h[5];
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
+
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
+
+
+    //ComputeNeigh
+    {
+      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
+
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridLocalComputeNeigh>
+        policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
+      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+    }
+
+    //ComputeCayleyKlein
+    {
+      // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridLocalComputeCayleyKlein>
+        policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
+      Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
+    }
+
+    //PreUi
+    {
+      auto policy_pre_ui = snap_get_policy<DeviceType, tile_size_pre_ui, TagCSNAGridLocalPreUi>(chunk_size_div, twojmax + 1);
+      Kokkos::parallel_for("PreUi", policy_pre_ui, *this);
+    }
+
+    // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
+    {
+      // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
+      // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+      const int tile_size = vector_length * (twojmax + 1);
+      const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
+
+      if (chunk_size < parallel_thresh)
+      {
+        // Version with parallelism over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
+        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridLocalComputeUiSmall>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
+      } else {
+        // Version w/out parallelism over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal)
+        const int n_teams = chunk_size_div * max_neighs;
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridLocalComputeUiLarge>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
+      }
+    }
+
+    //TransformUi: un-"fold" ulisttot, zero ylist
+    {
+      // Expand ulisttot_re,_im -> ulisttot
+      // Zero out ylist
+      auto policy_transform_ui = snap_get_policy<DeviceType, tile_size_transform_ui, TagCSNAGridLocalTransformUi>(chunk_size_div, snaKK.idxu_max);
+      Kokkos::parallel_for("TransformUi", policy_transform_ui, *this);
+    }
+
+    //Compute bispectrum
+    // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h`
+
+    //ComputeZi and Bi
+    if (nelements > 1) {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridLocalComputeZi<true>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this);
+
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridLocalComputeBi<true>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this);
+    } else {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridLocalComputeZi<false>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);
+
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridLocalComputeBi<false>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this);
+    }
+
+    // Fill the grid array with bispectrum values
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocal2Fill> policy_fill(0,chunk_size);
+      Kokkos::parallel_for(policy_fill, *this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+
+  } // end while
+
+  copymode = 0;
+
+  k_alocal.template modify<DeviceType>();
+  k_alocal.template sync<LMPHostType>();
+}
+
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the GPU codepath. These take advantage
+   of AoSoA data layouts and scratch memory for recursive polynomials
+------------------------------------------------------------------------- */
+
+/*
+ Team-policy kernel: one thread per grid point of the current chunk.
+ For its grid point the thread
+   - computes the grid indices and coordinates and stores them in columns
+     0-5 of d_alocal,
+   - zeroes the columns [size_local_cols_base, size_local_cols) of d_alocal,
+   - collects every atom inside the cutoff into the snaKK neighbor arrays
+     and records the neighbor count in d_ninside.
+ */
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeNeigh>::member_type& team) const {
+
+  // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
+  // Main difference is that we don't use the neighbor class or neighbor variables here.
+  // This is because the grid points are not atoms and therefore do not get assigned
+  // neighbors in LAMMPS.
+  // TODO: If we did make a neighborlist for each grid point, we could use current
+  //       routines and avoid having to loop over all atoms (which limits us to
+  //       natoms = max team size).
+
+  // basic quantities associated with this team:
+  // team_rank : rank of thread in this team
+  // league_rank : rank of team in this league
+  // team_size : number of threads in this team
+
+  // index of this thread's grid point within the current chunk
+  const int ii = team.team_rank() + team.league_rank() * team.team_size();
+
+  if (ii >= chunk_size) return;
+
+  // index of the grid point within the whole local grid
+  const int igrid = ii + chunk_offset;
+
+  // convert the flat index to (ix,iy,iz) grid indices, x running fastest,
+  // then shift by the lower bounds nxlo/nylo/nzlo
+
+  int iz = igrid/(xlen*ylen);
+  const int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  // multiply grid integers by grid spacing delx, dely, delz
+  double xgrid[3];
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+    // domainKK->lamda2x(xgrid, xgrid) can't be used because calling a
+    // __host__ function from a __host__ __device__ function is not allowed,
+    // so the transformation is written out with the members h0..h5 and
+    // lo0..lo2.
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+
+  // Zeroing out the components, which are filled as a sum.
+  for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
+    d_alocal(igrid, icol) = 0.0;
+  }
+
+  // Fill grid info columns
+  d_alocal(igrid, 0) = ix;
+  d_alocal(igrid, 1) = iy;
+  d_alocal(igrid, 2) = iz;
+  d_alocal(igrid, 3) = xtmp;
+  d_alocal(igrid, 4) = ytmp;
+  d_alocal(igrid, 5) = ztmp;
+
+  // currently, all grid points are type 1
+  // not clear what a better choice would be
+
+  const int itype = 1;
+  int ielem = 0;
+  if (chemflag) ielem = d_map[itype];
+
+  // Single pass over all ntotal atoms. Each atom inside the cutoff is
+  // appended at slot `offset`, so after the loop `offset` is exactly the
+  // number of filled neighbor slots. Counting and filling with the same
+  // test guarantees that d_ninside never exceeds the number of valid
+  // entries (e.g. for rsq == 1e-20 or a NaN distance).
+  int offset = 0;
+  for (int j = 0; j < ntotal; j++){
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    const int jtype = type(j);
+
+    // don't include atoms that share location with grid point
+    if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
+      // element index of the neighbor: 0 unless chemflag is set
+      int jelem = 0;
+      if (chemflag) jelem = d_map[jtype];
+      snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
+      snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
+      snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
+      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+      // actually since the views here have values starting at 0, let's use jelem
+      snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      snaKK.inside(ii,offset) = j;
+      if (switchinnerflag) {
+        snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+      }
+      if (chemflag)
+        snaKK.element(ii,offset) = jelem;
+      else
+        snaKK.element(ii,offset) = 0;
+      offset++;
+    }
+  }
+
+  // number of neighbors stored for this grid point
+  d_ninside(ii) = offset;
+}
+
+/* ----------------------------------------------------------------------
+  Pre-compute the Cayley-Klein parameters for reuse in later routines
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
+
+  // rebuild the in-chunk index from its (iatom_mod, iatom_div) decomposition
+  const int idx = iatom_div * vector_length + iatom_mod;
+
+  // only valid chunk entries and neighbor slots below the stored neighbor
+  // count do any work; d_ninside is not read for out-of-range entries
+  if (idx < chunk_size && jnbor < d_ninside(idx))
+    snaKK.compute_cayley_klein(idx, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+  Initialize the "ulisttot" structure with non-zero on-diagonal terms
+  and zero terms elsewhere
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const {
+  // iatom is the index of a grid point within the current chunk
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  // Grid points are not atoms, so the per-atom type() view must not be
+  // indexed with iatom. All grid points are treated as type 1 and the
+  // element index is 0 unless chemflag is set, exactly as in the
+  // TagCSNAGridLocalComputeNeigh kernel.
+  const int itype = 1;
+  int ielem = 0;
+  if (chemflag) ielem = d_map[itype];
+
+  snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const {
+  // iatom is the index of a grid point within the current chunk
+  if (iatom >= chunk_size) return;
+
+  // Grid points are not atoms, so the per-atom type() view must not be
+  // indexed with iatom. All grid points are treated as type 1 and the
+  // element index is 0 unless chemflag is set, exactly as in the
+  // TagCSNAGridLocalComputeNeigh kernel.
+  const int itype = 1;
+  int ielem = 0;
+  if (chemflag) ielem = d_map[itype];
+
+  snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom) const {
+  // iatom is the index of a grid point within the current chunk
+  if (iatom >= chunk_size) return;
+
+  // Grid points are not atoms, so the per-atom type() view must not be
+  // indexed with iatom. All grid points are treated as type 1 and the
+  // element index is 0 unless chemflag is set, exactly as in the
+  // TagCSNAGridLocalComputeNeigh kernel.
+  const int itype = 1;
+  int ielem = 0;
+  if (chemflag) ielem = d_map[itype];
+
+  // serial sweep over all j = 0..twojmax for this grid point
+  for (int j = 0; j <= twojmax; j++)
+    snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiSmall>::member_type& team) const {
+
+  // flattened (atom_div, bend location, neighbor) index owned by this thread
+  int flat = team.league_rank() * team_size_compute_ui + team.team_rank();
+
+  // number of flattened entries that belong to one atom_div block
+  const int per_block = max_neighs * (twojmax + 1);
+
+  // peel off iatom_div first, then split the remainder into jbend and jj
+  int iatom_div = flat / per_block; // intentionally not "const": works around a GCC 7 bug
+  const int rem = flat - iatom_div * per_block;
+  const int jbend = rem / max_neighs;
+  int jj = rem - jbend * max_neighs; // intentionally not "const": works around a GCC 7 bug
+
+  // vector lanes run over the vector_length entries of the atom_div block
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = vector_length * iatom_div + iatom_mod;
+
+    // skip entries past the chunk and neighbor slots that were never filled
+    if (ii < chunk_size && jj < d_ninside(ii))
+      snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiLarge>::member_type& team) const {
+
+  // flattened (atom_div, neighbor) index owned by this thread
+  int flat = team.league_rank() * team_size_compute_ui + team.team_rank();
+
+  // split into the atom_div block and the neighbor slot inside it
+  int iatom_div = flat / max_neighs; // intentionally not "const": works around a GCC 7 bug
+  int jj = flat - iatom_div * max_neighs;
+
+  // vector lanes run over the vector_length entries of the atom_div block
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = vector_length * iatom_div + iatom_mod;
+
+    // skip entries past the chunk and neighbor slots that were never filled
+    if (ii < chunk_size && jj < d_ninside(ii))
+      snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
+  });
+}
+
+/* ----------------------------------------------------------------------
+  De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
+  structure. Zero-initialize ylist. CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const {
+  // rebuild the in-chunk index from its (iatom_mod, iatom_div) decomposition
+  const int idx = iatom_div * vector_length + iatom_mod;
+
+  // indices beyond the chunk or beyond idxu_max do no work
+  if (idx < chunk_size && idxu < snaKK.idxu_max)
+    snaKK.transform_ui(idx, idxu);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const {
+  // two-index variant: iatom is already the in-chunk index
+  if (iatom < chunk_size)
+    snaKK.transform_ui(iatom, idxu);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom) const {
+  // one-index variant: serial sweep over every idxu for this chunk entry
+  if (iatom < chunk_size) {
+    for (int k = 0; k < snaKK.idxu_max; ++k)
+      snaKK.transform_ui(iatom, k);
+  }
+}
+
+/* ----------------------------------------------------------------------
+  Compute all elements of the Z tensor and store them into the `zlist`
+   view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom_mod, const int& jjz, const int& iatom_div) const {
+  // rebuild the in-chunk index from its (iatom_mod, iatom_div) decomposition
+  const int idx = iatom_div * vector_length + iatom_mod;
+
+  // indices beyond the chunk or beyond idxz_max do no work
+  if (idx < chunk_size && jjz < snaKK.idxz_max)
+    snaKK.template compute_zi<chemsnap>(idx, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom, const int& jjz) const {
+  // two-index variant: iatom is already the in-chunk index
+  if (iatom < chunk_size)
+    snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom) const {
+  // one-index variant: serial sweep over every jjz for this chunk entry
+  if (iatom < chunk_size) {
+    for (int k = 0; k < snaKK.idxz_max; ++k)
+      snaKK.template compute_zi<chemsnap>(iatom, k);
+  }
+}
+
+/* ----------------------------------------------------------------------
+  Compute the energy triple products and store in the "blist" view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom_mod, const int& jjb, const int& iatom_div) const {
+  // rebuild the in-chunk index from its (iatom_mod, iatom_div) decomposition
+  const int idx = iatom_div * vector_length + iatom_mod;
+
+  // indices beyond the chunk or beyond idxb_max do no work
+  if (idx < chunk_size && jjb < snaKK.idxb_max)
+    snaKK.template compute_bi<chemsnap>(idx, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom, const int& jjb) const {
+  // two-index variant: iatom is already the in-chunk index
+  if (iatom < chunk_size)
+    snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom) const {
+  // one-index variant: serial sweep over every jjb for this chunk entry
+  if (iatom < chunk_size) {
+    for (int k = 0; k < snaKK.idxb_max; ++k)
+      snaKK.template compute_bi<chemsnap>(iatom, k);
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocal2Fill, const int& ii) const {
+
+  // Copy the bispectrum components of one grid point from snaKK.blist into
+  // its row of d_alocal.
+  //   ii    : index of the grid point within the current chunk (blist row)
+  //   igrid : index of the grid point in d_alocal
+  // Columns 0-5 of d_alocal (grid indices and coordinates) are written by
+  // the TagCSNAGridLocalComputeNeigh kernel, so nothing needs to be
+  // recomputed here.
+  const int igrid = ii + chunk_offset;
+
+  const auto idxb_max = snaKK.idxb_max;
+
+  // linear contributions: coefficient icoeff maps to (idx_chem, idxb) in
+  // blist and is stored after the six grid-info columns
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    const auto idxb = icoeff % idxb_max;
+    const auto idx_chem = icoeff / idxb_max;
+    d_alocal(igrid,icoeff+6) = snaKK.blist(ii,idx_chem,idxb);
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   utility functions
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
+  // largest team size Kokkos reports for a parallel_for of this functor
+  // launched with tag TagStyle
+  const int limit = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  // shrink team_size (in place) when team_size*vector_length exceeds the limit
+  const bool too_large = team_size*vector_length > limit;
+  if (too_large) team_size = limit/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
+  // largest team size Kokkos reports for a parallel_reduce of this functor
+  // launched with tag TagStyle
+  const int limit = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
+
+  // shrink team_size (in place) when team_size*vector_length exceeds the limit
+  const bool too_large = team_size*vector_length > limit;
+  if (too_large) team_size = limit/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<typename scratch_type>
+int ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
+  // unmanaged 1-d view of scratch_type in the scratch memory space of the
+  // default execution space; only its shmem_size() query is used
+  using ScratchView = Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+
+  // scratch size reported by Kokkos for values_per_team elements
+  return ScratchView::shmem_size(values_per_team);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   routines used by template reference classes
+------------------------------------------------------------------------- */
+
+
+// Constructor: forwards all arguments to the base class instantiated with
+// SNAP_KOKKOS_REAL and SNAP_KOKKOS_DEVICE_VECLEN.
+template<class DeviceType>
+ComputeSNAGridLocalKokkosDevice<DeviceType>::ComputeSNAGridLocalKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg)
+{
+}
+
+// Forwards to the compute_local() implementation of the Base class.
+template<class DeviceType>
+void ComputeSNAGridLocalKokkosDevice<DeviceType>::compute_local()
+{
+  Base::compute_local();
+}
+
+#ifdef LMP_KOKKOS_GPU
+// Constructor: forwards all arguments to the base class instantiated with
+// SNAP_KOKKOS_REAL and SNAP_KOKKOS_HOST_VECLEN.
+template<class DeviceType>
+ComputeSNAGridLocalKokkosHost<DeviceType>::ComputeSNAGridLocalKokkosHost(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg)
+{
+}
+
+// Forwards to the compute_local() implementation of the Base class.
+template<class DeviceType>
+void ComputeSNAGridLocalKokkosHost<DeviceType>::compute_local()
+{
+  Base::compute_local();
+}
+#endif
+
+}
--- a/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp
+++ b/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp
@ -225,7 +225,8 @@ void FixACKS2ReaxFFKokkos<DeviceType>::pre_force(int /*vflag*/)

  allocate_array();

-  if (!allocated_flag || last_allocate < neighbor->lastcall) {
+  if (!allocated_flag || last_allocate < neighbor->lastcall
+      || nlocal_last_allocate != nlocal) {

    // get max number of neighbor

@ -281,6 +282,7 @@ void FixACKS2ReaxFFKokkos<DeviceType>::pre_force(int /*vflag*/)
    prev_last_rows_rank = last_rows_rank;

    last_allocate = update->ntimestep;
+    nlocal_last_allocate = nlocal;
  }

  // compute_H
@ -430,8 +432,6 @@ void FixACKS2ReaxFFKokkos<DeviceType>::num_neigh_item(int ii, bigint &totneigh)
 template<class DeviceType>
 void FixACKS2ReaxFFKokkos<DeviceType>::allocate_matrix()
 {
-  nmax = atom->nmax;
-
  // determine the total space for the H matrix

  m_cap_big = 0;
@ -456,15 +456,15 @@ void FixACKS2ReaxFFKokkos<DeviceType>::allocate_matrix()

  // H matrix

-  d_firstnbr = typename AT::t_bigint_1d("acks2/kk:firstnbr",nmax);
-  d_numnbrs = typename AT::t_int_1d("acks2/kk:numnbrs",nmax);
+  d_firstnbr = typename AT::t_bigint_1d("acks2/kk:firstnbr",nlocal);
+  d_numnbrs = typename AT::t_int_1d("acks2/kk:numnbrs",nlocal);
  d_jlist = typename AT::t_int_1d("acks2/kk:jlist",m_cap_big);
  d_val = typename AT::t_ffloat_1d("acks2/kk:val",m_cap_big);

  // X matrix

-  d_firstnbr_X = typename AT::t_bigint_1d("acks2/kk:firstnbr_X",nmax);
-  d_numnbrs_X = typename AT::t_int_1d("acks2/kk:numnbrs_X",nmax);
+  d_firstnbr_X = typename AT::t_bigint_1d("acks2/kk:firstnbr_X",nlocal);
+  d_numnbrs_X = typename AT::t_int_1d("acks2/kk:numnbrs_X",nlocal);
  d_jlist_X = typename AT::t_int_1d("acks2/kk:jlist_X",m_cap_big);
  d_val_X = typename AT::t_ffloat_1d("acks2/kk:val_X",m_cap_big);
 }
--- a/src/KOKKOS/fix_acks2_reaxff_kokkos.h
+++ b/src/KOKKOS/fix_acks2_reaxff_kokkos.h
@ -243,7 +243,7 @@ class FixACKS2ReaxFFKokkos : public FixACKS2ReaxFF, public KokkosBase {
  void calculate_Q() override;

  int neighflag;
-  int nlocal,nall,nmax,newton_pair;
+  int nlocal,nlocal_last_allocate,nall,nmax,newton_pair;
  int count, isuccess;
  double alpha, beta, omega, cutsq;

--- a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp
+++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp
@ -215,9 +215,11 @@ void FixQEqReaxFFKokkos<DeviceType>::pre_force(int /*vflag*/)

  // get max number of neighbor

-  if (!allocated_flag || last_allocate < neighbor->lastcall) {
+  if (!allocated_flag || last_allocate < neighbor->lastcall
+      || nlocal_last_allocate != nlocal) {
    allocate_matrix();
    last_allocate = update->ntimestep;
+    nlocal_last_allocate = nlocal;
  }

  // compute_H
@ -313,8 +315,6 @@ void FixQEqReaxFFKokkos<DeviceType>::num_neigh_item(int ii, bigint &totneigh) co
 template<class DeviceType>
 void FixQEqReaxFFKokkos<DeviceType>::allocate_matrix()
 {
-  nmax = atom->nmax;
-
  // determine the total space for the H matrix

  m_cap_big = 0;
@ -332,8 +332,8 @@ void FixQEqReaxFFKokkos<DeviceType>::allocate_matrix()
  d_jlist = typename AT::t_int_1d();
  d_val = typename AT::t_ffloat_1d();

-  d_firstnbr = typename AT::t_bigint_1d("qeq/kk:firstnbr",nmax);
-  d_numnbrs = typename AT::t_int_1d("qeq/kk:numnbrs",nmax);
+  d_firstnbr = typename AT::t_bigint_1d("qeq/kk:firstnbr",nlocal);
+  d_numnbrs = typename AT::t_int_1d("qeq/kk:numnbrs",nlocal);
  d_jlist = typename AT::t_int_1d("qeq/kk:jlist",m_cap_big);
  d_val = typename AT::t_ffloat_1d("qeq/kk:val",m_cap_big);
 }
--- a/Show More
+++ b/Show More