diff --git a/cmake/Modules/Packages/ML-PACE.cmake b/cmake/Modules/Packages/ML-PACE.cmake
index 30aa433a58..ce8f02f5f4 100644
--- a/cmake/Modules/Packages/ML-PACE.cmake
+++ b/cmake/Modules/Packages/ML-PACE.cmake
@@ -1,6 +1,6 @@
-set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2023.10.04.pre.tar.gz" CACHE STRING "URL for PACE evaluator library sources")
+set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2023.10.04.tar.gz" CACHE STRING "URL for PACE evaluator library sources")
 
-set(PACELIB_MD5 "61ba11a37ee00de8365b18b521d394a6" CACHE STRING "MD5 checksum of PACE evaluator library tarball")
+set(PACELIB_MD5 "70ff79f4e59af175e55d24f3243ad1ff" CACHE STRING "MD5 checksum of PACE evaluator library tarball")
 mark_as_advanced(PACELIB_URL)
 mark_as_advanced(PACELIB_MD5)
 GetFallbackURL(PACELIB_URL PACELIB_FALLBACK)
diff --git a/doc/src/Commands_fix.rst b/doc/src/Commands_fix.rst
index 1d0af03f02..7301d1345e 100644
--- a/doc/src/Commands_fix.rst
+++ b/doc/src/Commands_fix.rst
@@ -69,7 +69,7 @@ OPT.
    * :doc:`drude/transform/inverse <fix_drude_transform>`
    * :doc:`dt/reset (k) <fix_dt_reset>`
    * :doc:`edpd/source <fix_dpd_source>`
-   * :doc:`efield <fix_efield>`
+   * :doc:`efield (k) <fix_efield>`
    * :doc:`efield/tip4p <fix_efield>`
    * :doc:`ehex <fix_ehex>`
    * :doc:`electrode/conp (i) <fix_electrode>`
@@ -233,7 +233,7 @@ OPT.
    * :doc:`spring <fix_spring>`
    * :doc:`spring/chunk <fix_spring_chunk>`
    * :doc:`spring/rg <fix_spring_rg>`
-   * :doc:`spring/self <fix_spring_self>`
+   * :doc:`spring/self (k) <fix_spring_self>`
    * :doc:`srd <fix_srd>`
    * :doc:`store/force <fix_store_force>`
    * :doc:`store/state <fix_store_state>`
diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst
index 0d54913bd7..923c040aaf 100644
--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@@ -305,5 +305,5 @@ OPT.
    * :doc:`wf/cut <pair_wf_cut>`
    * :doc:`ylz <pair_ylz>`
    * :doc:`yukawa (gko) <pair_yukawa>`
-   * :doc:`yukawa/colloid (go) <pair_yukawa_colloid>`
+   * :doc:`yukawa/colloid (gko) <pair_yukawa_colloid>`
    * :doc:`zbl (gko) <pair_zbl>`
diff --git a/doc/src/Howto_output.rst b/doc/src/Howto_output.rst
index 851b7703fd..6fcd36ab56 100644
--- a/doc/src/Howto_output.rst
+++ b/doc/src/Howto_output.rst
@@ -1,7 +1,7 @@
 Output from LAMMPS (thermo, dumps, computes, fixes, variables)
 ==============================================================
 
-There are four basic kinds of LAMMPS output:
+There are four basic forms of LAMMPS output:
 
 * :doc:`Thermodynamic output <thermo_style>`, which is a list of
   quantities printed every few timesteps to the screen and logfile.
@@ -20,18 +20,17 @@ output files, depending on what :doc:`dump <dump>` and :doc:`fix <fix>`
 commands you specify.
 
 As discussed below, LAMMPS gives you a variety of ways to determine
-what quantities are computed and printed when the thermodynamics,
+what quantities are calculated and printed when the thermodynamics,
 dump, or fix commands listed above perform output.  Throughout this
 discussion, note that users can also :doc:`add their own computes and
-fixes to LAMMPS <Modify>` which can then generate values that can then
-be output with these commands.
+fixes to LAMMPS <Modify>` which can generate values that can then be
+output with these commands.
 
 The following subsections discuss different LAMMPS commands related
 to output and the kind of data they operate on and produce:
 
 * :ref:`Global/per-atom/local/per-grid data <global>`
 * :ref:`Scalar/vector/array data <scalar>`
-* :ref:`Per-grid data <grid>`
 * :ref:`Disambiguation <disambiguation>`
 * :ref:`Thermodynamic output <thermo>`
 * :ref:`Dump file output <dump>`
@@ -48,34 +47,65 @@ to output and the kind of data they operate on and produce:
 Global/per-atom/local/per-grid data
 -----------------------------------
 
-Various output-related commands work with four different styles of
+Various output-related commands work with four different "styles" of
 data: global, per-atom, local, and per-grid.  A global datum is one or
 more system-wide values, e.g. the temperature of the system.  A
 per-atom datum is one or more values per atom, e.g. the kinetic energy
 of each atom.  Local datums are calculated by each processor based on
-the atoms it owns, but there may be zero or more per atom, e.g. a list
+the atoms it owns, and there may be zero or more per atom, e.g. a list
 of bond distances.
 
 A per-grid datum is one or more values per grid cell, for a grid which
-overlays the simulation domain.  The grid cells and the data they
-store are distributed across processors; each processor owns the grid
-cells whose center point falls within its subdomain.
+overlays the simulation domain.  Similar to atoms and per-atom data,
+the grid cells and the data they store are distributed across
+processors; each processor owns the grid cells whose center points
+fall within its subdomain.
 
 .. _scalar:
 
 Scalar/vector/array data
 ------------------------
 
-Global, per-atom, and local datums can come in three kinds: a single
-scalar value, a vector of values, or a 2d array of values.  The doc
-page for a "compute" or "fix" or "variable" that generates data will
-specify both the style and kind of data it produces, e.g. a per-atom
-vector.
+Global, per-atom, local, and per-grid datums can come in three
+"kinds": a single scalar value, a vector of values, or a 2d array of
+values.  More specifically these are the valid kinds for each style:
 
-When a quantity is accessed, as in many of the output commands
-discussed below, it can be referenced via the following bracket
-notation, where ID in this case is the ID of a compute.  The leading
-"c\_" would be replaced by "f\_" for a fix, or "v\_" for a variable:
+* global scalar
+* global vector
+* global array
+* per-atom vector
+* per-atom array
+* local vector
+* local array
+* per-grid vector
+* per-grid array
+
+A per-atom vector means a single value per atom; the length of the
+"vector" is the number of atoms.  A per-atom array means multiple values
+per atom.  Similarly a local vector or array means one or multiple
+values per entity (e.g. per bond in the system).  And a per-grid
+vector or array means one or multiple values per grid cell.
+
+The doc page for a compute or fix or variable that generates data will
+specify both the styles and kinds of data it produces, e.g. a per-atom
+vector.  Note that a compute or fix may generate multiple styles and
+kinds of output.  However, for per-atom data only a vector or array is
+output, never both.  Likewise for local and per-grid data.  An
+example of a fix which generates multiple styles and kinds of data is
+the :doc:`fix mdi/qm <fix_mdi_qm>` command.  It outputs a global
+scalar, global vector, and per-atom array for the quantum mechanical
+energy and virial of the system and forces on each atom.
+
+By contrast, different variable styles generate only a single kind of
+data: a global scalar for an equal-style variable, global vector for a
+vector-style variable, and a per-atom vector for an atom-style
+variable.
+
+When data is accessed by another command, as in many of the output
+commands discussed below, it can be referenced via the following
+bracket notation, where ID in this case is the ID of a compute.  The
+leading "c\_" would be replaced by "f\_" for a fix, or "v\_" for a
+variable (and ID would be the name of the variable):
 
 +-------------+--------------------------------------------+
 | c_ID        | entire scalar, vector, or array            |
@@ -85,40 +115,56 @@ notation, where ID in this case is the ID of a compute.  The leading
 | c_ID[I][J]  | one element of array                       |
 +-------------+--------------------------------------------+
 
-In other words, using one bracket reduces the dimension of the data
-once (vector -> scalar, array -> vector).  Using two brackets reduces
-the dimension twice (array -> scalar).  Thus a command that uses
-scalar values as input can typically also process elements of a vector
-or array.
+Note that using one bracket reduces the dimension of the data once
+(vector -> scalar, array -> vector).  Using two brackets reduces the
+dimension twice (array -> scalar).  Thus a command that uses scalar
+values as input can also conceptually operate on an element of a
+vector or array.
 
-.. _grid:
-
-Per-grid data
-------------------------
-
-Per-grid data can come in two kinds: a vector of values (one per grid
-cekk), or a 2d array of values (multiple values per grid ckk).  The
-doc page for a "compute" or "fix" that generates data will specify
-names for both the grid(s) and datum(s) it produces, e.g. per-grid
-vectors or arrays, which can be referenced by other commands.  See the
-:doc:`Howto grid <Howto_grid>` doc page for more details.
+Per-grid vectors or arrays are accessed similarly, except that the ID
+for the compute or fix includes a grid name and a data name.  This is
+because a fix or compute can create multiple grids (of different
+sizes) and multiple sets of data (for each grid).  The fix or compute
+defines names for each grid and for each data set, so that all of them
+can be accessed by other commands.  See the :doc:`Howto grid
+<Howto_grid>` doc page for more details.
 
 .. _disambiguation:
 
 Disambiguation
 --------------
 
-Some computes and fixes produce data in multiple styles, e.g. a global
-scalar and a per-atom vector. Usually the context in which the input
-script references the data determines which style is meant. Example:
-if a compute provides both a global scalar and a per-atom vector, the
-former will be accessed by using ``c_ID`` in an equal-style variable,
-while the latter will be accessed by using ``c_ID`` in an atom-style
-variable.  Note that atom-style variable formulas can also access
-global scalars, but in this case it is not possible to do this
-directly because of the ambiguity.  Instead, an equal-style variable
-can be defined which accesses the global scalar, and that variable can
-be used in the atom-style variable formula in place of ``c_ID``.
+When a compute or fix produces data in multiple styles, e.g. global
+and per-atom, a reference to the data can sometimes be ambiguous.
+Usually the context in which the input script references the data
+determines which style is meant.
+
+For example, if a compute outputs a global vector and a per-atom
+array, an element of the global vector will be accessed by using
+``c_ID[I]`` in :doc:`thermodynamic output <thermo_style>`, while a
+column of the per-atom array will be accessed by using ``c_ID[I]`` in
+a :doc:`dump custom <dump>` command.
+
+However, if an :doc:`atom-style variable <variable>` references
+``c_ID[I]``, then it could be intended to refer to a single element of
+the global vector or a column of the per-atom array.  The doc page for
+any command that has a potential ambiguity (variables are the most
+common) will explain how to resolve the ambiguity.
+
+In this case, an atom-style variable references per-atom data if it
+exists.  If access to an element of a global vector is needed (as in
+this example), an equal-style variable which references the value can
+be defined and used in the atom-style variable formula instead.
+
+Similarly, :doc:`thermodynamic output <thermo_style>` can only
+reference global data from a compute or fix.  But you can indirectly
+access per-atom data as follows.  The reference ``c_ID[245][2]`` for
+the ID of a :doc:`compute displace/atom <compute_displace_atom>`
+command, refers to the y-component of displacement for the atom with
+ID 245.  While you cannot use that reference directly in the
+:doc:`thermo_style <thermo_style>` command, you can use it in an
+equal-style variable formula, and then reference the variable in
+thermodynamic output.
 
 .. _thermo:
 
@@ -389,7 +435,7 @@ output and input data types must match, e.g. global/per-atom/local
 data and scalar/vector/array data.
 
 Also note that, as described above, when a command takes a scalar as
-input, that could be an element of a vector or array.  Likewise a
+input, that could also be an element of a vector or array.  Likewise a
 vector input could be a column of an array.
 
 +--------------------------------------------------------+----------------------------------------------+----------------------------------------------------+
diff --git a/doc/src/Howto_triclinic.rst b/doc/src/Howto_triclinic.rst
index 0efadbcc8c..2983d013c6 100644
--- a/doc/src/Howto_triclinic.rst
+++ b/doc/src/Howto_triclinic.rst
@@ -12,7 +12,8 @@ is created, e.g. by the :doc:`create_box <create_box>` or
 :doc:`read_data <read_data>` or :doc:`read_restart <read_restart>`
 commands.  Additionally, LAMMPS defines box size parameters lx,ly,lz
 where lx = xhi-xlo, and similarly in the y and z dimensions.  The 6
-parameters, as well as lx,ly,lz, can be output via the :doc:`thermo_style custom <thermo_style>` command.
+parameters, as well as lx,ly,lz, can be output via the
+:doc:`thermo_style custom <thermo_style>` command.
 
 LAMMPS also allows simulations to be performed in triclinic
 (non-orthogonal) simulation boxes shaped as a parallelepiped with
diff --git a/doc/src/Tools.rst b/doc/src/Tools.rst
index a42e7c56a5..49022a4ee9 100644
--- a/doc/src/Tools.rst
+++ b/doc/src/Tools.rst
@@ -702,11 +702,15 @@ Prerequisites and portability
 LAMMPS GUI is programmed in C++ based on the C++11 standard and using
 the `Qt GUI framework <https://www.qt.io/product/framework>`_.
 Currently, Qt version 5.12 or later is required; Qt 5.15LTS is
-recommended; Qt 6.x not (yet) supported.  Building LAMMPS with CMake is
-required.  The LAMMPS GUI has been successfully compiled and tested on:
+recommended; support for Qt version 6.x is under active development and
+thus far only tested with Qt 6.5LTS on Linux.  Building LAMMPS with
+CMake is required.
+
+The LAMMPS GUI has been successfully compiled and tested on:
 
 - Ubuntu Linux 20.04LTS x86_64 using GCC 9, Qt version 5.12
 - Fedora Linux 38 x86\_64 using GCC 13 and Clang 16, Qt version 5.15LTS
+- Fedora Linux 38 x86\_64 using GCC 13, Qt version 6.5LTS
 - Apple macOS 12 (Monterey) and macOS 13 (Ventura) with Xcode on arm64 and x86\_64, Qt version 5.15LTS
 - Windows 10 and 11 x86_64 with Visual Studio 2022 and Visual C++ 14.36, Qt version 5.15LTS
 - Windows 10 and 11 x86_64 with MinGW / GCC 10.0 cross-compiler on Fedora 38, Qt version 5.15LTS
@@ -717,7 +721,7 @@ required.  The LAMMPS GUI has been successfully compiled and tested on:
 Pre-compiled executables
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
-Pre-compiled LAMMPS executables including the GUI are currently
+Pre-compiled LAMMPS executable packages that include the GUI are currently
 available from https://download.lammps.org/static or
 https://github.com/lammps/lammps/releases.  You can unpack the archives
 (or mount the macOS disk image) and run the GUI directly in place. The
@@ -742,7 +746,10 @@ stored in a location where CMake can find them without additional help.
 Otherwise, the location of the Qt library installation must be indicated
 by setting ``-D Qt5_DIR=/path/to/qt5/lib/cmake/Qt5``, which is a path to
 a folder inside the Qt installation that contains the file
-``Qt5Config.cmake``.
+``Qt5Config.cmake``. Similarly, for Qt6 the location of the Qt library
+installation can be indicated by setting ``-D Qt6_DIR=/path/to/qt6/lib/cmake/Qt6``,
+if necessary.  When both Qt5 and Qt6 are available, Qt6 will be preferred
+unless ``-D LAMMPS_GUI_USE_QT5=yes`` is set.
 
 It should be possible to build the LAMMPS GUI as a standalone
 compilation (e.g. when LAMMPS has been compiled with traditional make),
diff --git a/doc/src/atom_modify.rst b/doc/src/atom_modify.rst
index 1e5a3d49ff..21590e6680 100644
--- a/doc/src/atom_modify.rst
+++ b/doc/src/atom_modify.rst
@@ -65,6 +65,11 @@ switch.  This is described on the :doc:`Build_settings <Build_settings>`
 doc page.  If atom IDs are not used, they must be specified as 0 for
 all atoms, e.g. in a data or restart file.
 
+.. note::
+
+   If a :doc:`triclinic simulation box <Howto_triclinic>` is used,
+   atom IDs are required, due to how neighbor lists are built.
+
 The *map* keyword determines how atoms with specific IDs are found
 when required.  An example are the bond (angle, etc) methods which
 need to find the local index of an atom with a specific global ID
diff --git a/doc/src/compute.rst b/doc/src/compute.rst
index 363dbdbdaa..1cce59fbcc 100644
--- a/doc/src/compute.rst
+++ b/doc/src/compute.rst
@@ -27,58 +27,62 @@ Examples
 Description
 """""""""""
 
-Define a computation that will be performed on a group of atoms.
-Quantities calculated by a compute are instantaneous values, meaning
-they are calculated from information about atoms on the current
-timestep or iteration, though a compute may internally store some
-information about a previous state of the system.  Defining a compute
-does not perform a computation.  Instead computes are invoked by other
-LAMMPS commands as needed (e.g., to calculate a temperature needed for
-a thermostat fix or to generate thermodynamic or dump file output).
-See the :doc:`Howto output <Howto_output>` page for a summary of
-various LAMMPS output options, many of which involve computes.
+Define a diagnostic computation that will be performed on a group of
+atoms.  Quantities calculated by a compute are instantaneous values,
+meaning they are calculated from information about atoms on the
+current timestep or iteration, though internally a compute may store
+some information about a previous state of the system.  Defining a
+compute does not perform the computation.  Instead computes are
+invoked by other LAMMPS commands as needed (e.g., to calculate a
+temperature needed for a thermostat fix or to generate thermodynamic
+or dump file output).  See the :doc:`Howto output <Howto_output>` page
+for a summary of various LAMMPS output options, many of which involve
+computes.
 
 The ID of a compute can only contain alphanumeric characters and
 underscores.
 
 ----------
 
-Computes calculate one or more of four styles of quantities: global,
-per-atom, local, or per-atom.  A global quantity is one or more
-system-wide values, e.g. the temperature of the system.  A per-atom
-quantity is one or more values per atom, e.g. the kinetic energy of
-each atom.  Per-atom values are set to 0.0 for atoms not in the
-specified compute group.  Local quantities are calculated by each
-processor based on the atoms it owns, but there may be zero or more
-per atom, e.g. a list of bond distances.  Per-grid quantities are
-calculated on a regular 2d or 3d grid which overlays a 2d or 3d
-simulation domain.  The grid points and the data they store are
-distributed across processors; each processor owns the grid points
-which fall within its subdomain.
+Computes calculate and store any of four *styles* of quantities:
+global, per-atom, local, or per-grid.
 
-Computes that produce per-atom quantities have the word "atom" at the
-end of their style, e.g. *ke/atom*\ .  Computes that produce local
-quantities have the word "local" at the end of their style,
-e.g. *bond/local*\ .  Computes that produce per-grid quantities have
-the word "grid" at the end of their style, e.g. *property/grid*\ .
-Styles with neither "atom" or "local" or "grid" at the end of their
-style name produce global quantities.
+A global quantity is one or more system-wide values, e.g. the
+temperature of the system.  A per-atom quantity is one or more values
+per atom, e.g. the kinetic energy of each atom.  Per-atom values are
+set to 0.0 for atoms not in the specified compute group.  Local
+quantities are calculated by each processor based on the atoms it
+owns, but there may be zero or more per atom, e.g. a list of bond
+distances.  Per-grid quantities are calculated on a regular 2d or 3d
+grid which overlays a 2d or 3d simulation domain.  The grid points and
+the data they store are distributed across processors; each processor
+owns the grid points which fall within its subdomain.
 
-Note that a single compute typically produces either global or
-per-atom or local or per-grid values.  It does not compute both global
-and per-atom values.  It can produce local values or per-grid values
-in tandem with global or per-atom quantities.  The compute doc page
-will explain the details.
+As a general rule of thumb, computes that produce per-atom quantities
+have the word "atom" at the end of their style, e.g. *ke/atom*\ .
+Computes that produce local quantities have the word "local" at the
+end of their style, e.g. *bond/local*\ .  Computes that produce
+per-grid quantities have the word "grid" at the end of their style,
+e.g. *property/grid*\ .  And styles with neither "atom" nor "local" nor
+"grid" at the end of their style name produce global quantities.
 
-Global, per-atom, local, and per-grid quantities come in three kinds:
-a single scalar value, a vector of values, or a 2d array of values.
-The doc page for each compute describes the style and kind of values
-it produces, e.g. a per-atom vector.  Some computes produce more than
-one kind of a single style, e.g. a global scalar and a global vector.
+Global, per-atom, local, and per-grid quantities can also be of three
+*kinds*: a single scalar value (global only), a vector of values, or a
+2d array of values.  For per-atom, local, and per-grid quantities, a
+"vector" means a single value for each atom, each local entity
+(e.g. bond), or grid cell.  Likewise an "array" means multiple values
+for each atom, each local entity, or each grid cell.
 
-When a compute quantity is accessed, as in many of the output commands
-discussed below, it can be referenced via the following bracket
-notation, where ID is the ID of the compute:
+Note that a single compute can produce any combination of global,
+per-atom, local, or per-grid values.  Likewise it can produce any
+combination of scalar, vector, or array output for each style.  The
+exception is that for per-atom, local, and per-grid output, either a
+vector or array can be produced, but not both.  The doc page for each
+compute explains the values it produces.
+
+When a compute output is accessed by another input script command it
+is referenced via the following bracket notation, where ID is the ID
+of the compute:
 
 +-------------+--------------------------------------------+
 | c_ID        | entire scalar, vector, or array            |
@@ -89,17 +93,23 @@ notation, where ID is the ID of the compute:
 +-------------+--------------------------------------------+
 
 In other words, using one bracket reduces the dimension of the
-quantity once (vector :math:`\to` scalar, array :math:`\to` vector).  Using two
-brackets reduces the dimension twice (array :math:`\to` scalar).  Thus a
-command that uses scalar compute values as input can also process elements of a
-vector or array.
+quantity once (vector :math:`\to` scalar, array :math:`\to` vector).
+Using two brackets reduces the dimension twice (array :math:`\to`
+scalar).  Thus, for example, a command that uses global scalar compute
+values as input can also process elements of a vector or array.
+Depending on the command, this can either be done directly using the
+syntax in the table, or by first defining a :doc:`variable <variable>`
+of the appropriate style to store the quantity, then using the
+variable as an input to the command.
 
-Note that commands and :doc:`variables <variable>` which use compute
-quantities typically do not allow for all kinds (e.g., a command may
-require a vector of values, not a scalar).  This means there is no
-ambiguity about referring to a compute quantity as c_ID even if it
-produces, for example, both a scalar and vector.  The doc pages for
-various commands explain the details.
+Note that commands and :doc:`variables <variable>` which take compute
+outputs as input typically do not allow for all styles and kinds of
+data (e.g., a command may require global but not per-atom values, or
+it may require a vector of values, not a scalar).  This means there is
+typically no ambiguity about referring to a compute output as c_ID
+even if it produces, for example, both a scalar and vector.  The doc
+pages for various commands explain the details, including how any
+ambiguities are resolved.
 
 ----------
 
diff --git a/doc/src/compute_reduce.rst b/doc/src/compute_reduce.rst
index 204f1c090d..6820d2ee04 100644
--- a/doc/src/compute_reduce.rst
+++ b/doc/src/compute_reduce.rst
@@ -37,13 +37,16 @@ Syntax
        v_name = per-atom vector calculated by an atom-style variable with name
 
 * zero or more keyword/args pairs may be appended
-* keyword = *replace*
+* keyword = *replace* or *inputs*
 
   .. parsed-literal::
 
        *replace* args = vec1 vec2
          vec1 = reduced value from this input vector will be replaced
          vec2 = replace it with vec1[N] where N is index of max/min value from vec2
+       *inputs* arg = peratom or local
+         peratom = all inputs are per-atom quantities (default)
+         local = all inputs are local quantities
 
 Examples
 """"""""
@@ -60,38 +63,44 @@ Description
 """""""""""
 
 Define a calculation that "reduces" one or more vector inputs into
-scalar values, one per listed input.  The inputs can be per-atom or
-local quantities; they cannot be global quantities.  Atom attributes
-are per-atom quantities, :doc:`computes <compute>` and :doc:`fixes <fix>`
-may generate any of the three kinds of quantities, and :doc:`atom-style variables <variable>` generate per-atom quantities.  See the
-:doc:`variable <variable>` command and its special functions which can
-perform the same operations as the compute reduce command on global
-vectors.
+scalar values, one per listed input.  For the compute reduce command,
+the inputs can be either per-atom or local quantities and must all be
+of the same kind (per-atom or local); see discussion of the optional
+*inputs* keyword below.  The compute reduce/region command can only be
+used with per-atom inputs.
+
+Atom attributes are per-atom quantities, :doc:`computes <compute>` and
+:doc:`fixes <fix>` can generate either per-atom or local quantities,
+and :doc:`atom-style variables <variable>` generate per-atom
+quantities.  See the :doc:`variable <variable>` command and its
+special functions which can perform the same reduction operations as
+the compute reduce command on global vectors.
 
 The reduction operation is specified by the *mode* setting.  The *sum*
 option adds the values in the vector into a global total.  The *min*
 or *max* options find the minimum or maximum value across all vector
 values.  The *minabs* or *maxabs* options find the minimum or maximum
 value across all absolute vector values.  The *ave* setting adds the
-vector values into a global total, then divides by the number of values
-in the vector.  The *sumsq* option sums the square of the values in the
-vector into a global total.  The *avesq* setting does the same as *sumsq*,
-then divides the sum of squares by the number of values.  The last two options
-can be useful for calculating the variance of some quantity (e.g., variance =
-sumsq :math:`-` ave\ :math:`^2`).  The *sumabs* option sums the absolute
-values in the vector into a global total.  The *aveabs* setting does the same
-as *sumabs*, then divides the sum of absolute values by the number of
+vector values into a global total, then divides by the number of
+values in the vector.  The *sumsq* option sums the square of the
+values in the vector into a global total.  The *avesq* setting does
+the same as *sumsq*, then divides the sum of squares by the number of
+values.  The last two options can be useful for calculating the
+variance of some quantity (e.g., variance = sumsq :math:`-` ave\
+:math:`^2`).  The *sumabs* option sums the absolute values in the
+vector into a global total.  The *aveabs* setting does the same as
+*sumabs*, then divides the sum of absolute values by the number of
 values.
 
 Each listed input is operated on independently.  For per-atom inputs,
 the group specified with this command means only atoms within the
-group contribute to the result.  For per-atom inputs, if the compute
-reduce/region command is used, the atoms must also currently be within
-the region.  Note that an input that produces per-atom quantities may
-define its own group which affects the quantities it returns.  For
-example, if a compute is used as an input which generates a per-atom
-vector, it will generate values of 0.0 for atoms that are not in the
-group specified for that compute.
+group contribute to the result.  Likewise for per-atom inputs, if the
+compute reduce/region command is used, the atoms must also currently
+be within the region.  Note that an input that produces per-atom
+quantities may define its own group which affects the quantities it
+returns.  For example, if a compute is used as an input which
+generates a per-atom vector, it will generate values of 0.0 for atoms
+that are not in the group specified for that compute.
 
 Each listed input can be an atom attribute (position, velocity, force
 component) or can be the result of a :doc:`compute <compute>` or
@@ -123,52 +132,54 @@ array with six columns:
 
 ----------
 
-The atom attribute values (*x*, *y*, *z*, *vx*, *vy*, *vz*, *fx*, *fy*, and
-*fz*) are self-explanatory.  Note that other atom attributes can be used as
-inputs to this fix by using the
-:doc:`compute property/atom <compute_property_atom>` command and then specifying
-an input value from that compute.
+The atom attribute values (*x*, *y*, *z*, *vx*, *vy*, *vz*, *fx*,
+*fy*, and *fz*) are self-explanatory.  Note that other atom attributes
+can be used as inputs to this compute by using the :doc:`compute
+property/atom <compute_property_atom>` command and then specifying an
+input value from that compute.
 
 If a value begins with "c\_", a compute ID must follow which has been
-previously defined in the input script.  Computes can generate
-per-atom or local quantities.  See the individual
-:doc:`compute <compute>` page for details.  If no bracketed integer
-is appended, the vector calculated by the compute is used.  If a
-bracketed integer is appended, the Ith column of the array calculated
-by the compute is used.  Users can also write code for their own
-compute styles and :doc:`add them to LAMMPS <Modify>`.  See the
-discussion above for how :math:`I` can be specified with a wildcard asterisk
-to effectively specify multiple values.
+previously defined in the input script.  Valid computes can generate
+per-atom or local quantities.  See the individual :doc:`compute
+<compute>` page for details.  If no bracketed integer is appended, the
+vector calculated by the compute is used.  If a bracketed integer is
+appended, the Ith column of the array calculated by the compute is
+used.  Users can also write code for their own compute styles and
+:doc:`add them to LAMMPS <Modify>`.  See the discussion above for how
+:math:`I` can be specified with a wildcard asterisk to effectively
+specify multiple values.
 
 If a value begins with "f\_", a fix ID must follow which has been
-previously defined in the input script.  Fixes can generate per-atom
-or local quantities.  See the individual :doc:`fix <fix>` page for
-details.  Note that some fixes only produce their values on certain
-timesteps, which must be compatible with when compute reduce
+previously defined in the input script.  Valid fixes can generate
+per-atom or local quantities.  See the individual :doc:`fix <fix>`
+page for details.  Note that some fixes only produce their values on
+certain timesteps, which must be compatible with when compute reduce
 references the values, else an error results.  If no bracketed integer
 is appended, the vector calculated by the fix is used.  If a bracketed
 integer is appended, the Ith column of the array calculated by the fix
 is used.  Users can also write code for their own fix style and
 :doc:`add them to LAMMPS <Modify>`.  See the discussion above for how
-:math:`I` can be specified with a wildcard asterisk to effectively specify
-multiple values.
+:math:`I` can be specified with a wildcard asterisk to effectively
+specify multiple values.
 
 If a value begins with "v\_", a variable name must follow which has
 been previously defined in the input script.  It must be an
 :doc:`atom-style variable <variable>`.  Atom-style variables can
 reference thermodynamic keywords and various per-atom attributes, or
 invoke other computes, fixes, or variables when they are evaluated, so
-this is a very general means of generating per-atom quantities to reduce.
+this is a very general means of generating per-atom quantities to
+reduce.
 
 ----------
 
 If the *replace* keyword is used, two indices *vec1* and *vec2* are
-specified, where each index ranges from 1 to the number of input values.
-The replace keyword can only be used if the *mode* is *min* or *max*\ .
-It works as follows.  A min/max is computed as usual on the *vec2*
-input vector.  The index :math:`N` of that value within *vec2* is also stored.
-Then, instead of performing a min/max on the *vec1* input vector, the
-stored index is used to select the :math:`N`\ th element of the *vec1* vector.
+specified, where each index ranges from 1 to the number of input
+values.  The replace keyword can only be used if the *mode* is *min*
+or *max*\ .  It works as follows.  A min/max is computed as usual on
+the *vec2* input vector.  The index :math:`N` of that value within
+*vec2* is also stored.  Then, instead of performing a min/max on the
+*vec1* input vector, the stored index is used to select the :math:`N`\
+th element of the *vec1* vector.
 
 Thus, for example, if you wish to use this compute to find the bond
 with maximum stretch, you can do it as follows:
@@ -190,6 +201,16 @@ information in this context, the *replace* keywords will extract the
 atom IDs for the two atoms in the bond of maximum stretch.  These atom
 IDs and the bond stretch will be printed with thermodynamic output.
 
+.. versionadded:: TBD
+
+The *inputs* keyword allows selection of whether all the inputs are
+per-atom or local quantities.  As noted above, all the inputs must be
+the same kind (per-atom or local).  Per-atom is the default setting.
+If a compute or fix is specified as an input, it must produce per-atom
+or local data to match this setting.  If it produces both, e.g. for
+the :doc:`compute voronoi/atom <compute_voronoi_atom>` command, then
+this keyword selects between them.
+
 ----------
 
 If a single input is specified this compute produces a global scalar
@@ -197,38 +218,41 @@ value.  If multiple inputs are specified, this compute produces a
 global vector of values, the length of which is equal to the number of
 inputs specified.
 
-As discussed below, for the *sum*, *sumabs*, and *sumsq* modes, the value(s)
-produced by this compute are all "extensive", meaning their value
-scales linearly with the number of atoms involved.  If normalized
-values are desired, this compute can be accessed by the
+As discussed below, for the *sum*, *sumabs*, and *sumsq* modes, the
+value(s) produced by this compute are all "extensive", meaning their
+value scales linearly with the number of atoms involved.  If
+normalized values are desired, this compute can be accessed by the
 :doc:`thermo_style custom <thermo_style>` command with
-:doc:`thermo_modify norm yes <thermo_modify>` set as an option.
-Or it can be accessed by a
-:doc:`variable <variable>` that divides by the appropriate atom count.
+:doc:`thermo_modify norm yes <thermo_modify>` set as an option.  Or it
+can be accessed by a :doc:`variable <variable>` that divides by the
+appropriate atom count.
 
 ----------
 
 Output info
 """""""""""
 
-This compute calculates a global scalar if a single input value is specified
-or a global vector of length :math:`N`, where :math:`N` is the number of
-inputs, and which can be accessed by indices 1 to :math:`N`.  These values can
-be used by any command that uses global scalar or vector values from a
-compute as input.  See the :doc:`Howto output <Howto_output>` doc page
-for an overview of LAMMPS output options.
+This compute calculates a global scalar if a single input value is
+specified or a global vector of length :math:`N`, where :math:`N` is
+the number of inputs, and which can be accessed by indices 1 to
+:math:`N`.  These values can be used by any command that uses global
+scalar or vector values from a compute as input.  See the :doc:`Howto
+output <Howto_output>` doc page for an overview of LAMMPS output
+options.
 
 All the scalar or vector values calculated by this compute are
 "intensive", except when the *sum*, *sumabs*, or *sumsq* modes are used on
 per-atom or local vectors, in which case the calculated values are
 "extensive".
 
-The scalar or vector values will be in whatever :doc:`units <units>` the
-quantities being reduced are in.
+The scalar or vector values will be in whatever :doc:`units <units>`
+the quantities being reduced are in.
 
 Restrictions
 """"""""""""
- none
+
+As noted above, the compute reduce/region command can only be used
+with per-atom inputs.
 
 Related commands
 """"""""""""""""
@@ -238,4 +262,4 @@ Related commands
 Default
 """""""
 
-none
+The default value for the *inputs* keyword is peratom.
diff --git a/doc/src/compute_voronoi_atom.rst b/doc/src/compute_voronoi_atom.rst
index 274be1b702..37e5386341 100644
--- a/doc/src/compute_voronoi_atom.rst
+++ b/doc/src/compute_voronoi_atom.rst
@@ -13,7 +13,7 @@ Syntax
 * ID, group-ID are documented in :doc:`compute <compute>` command
 * voronoi/atom = style name of this compute command
 * zero or more keyword/value pairs may be appended
-* keyword = *only_group* or *occupation* or *surface* or *radius* or *edge_histo* or *edge_threshold* or *face_threshold* or *neighbors* or *peratom*
+* keyword = *only_group* or *occupation* or *surface* or *radius* or *edge_histo* or *edge_threshold* or *face_threshold* or *neighbors*
 
   .. parsed-literal::
 
@@ -31,7 +31,6 @@ Syntax
        *face_threshold* arg = minarea
          minarea = minimum area for a face to be counted
        *neighbors* value = *yes* or *no* = store list of all neighbors or no
-       *peratom* value = *yes* or *no* = per-atom quantities accessible or no
 
 Examples
 """"""""
@@ -53,14 +52,12 @@ atoms in the simulation box.  The tessellation is calculated using all
 atoms in the simulation, but non-zero values are only stored for atoms
 in the group.
 
-By default two per-atom quantities are calculated by this compute.
-The first is the volume of the Voronoi cell around each atom.  Any
-point in an atom's Voronoi cell is closer to that atom than any other.
-The second is the number of faces of the Voronoi cell. This is
-equal to the number of nearest neighbors of the central atom,
-plus any exterior faces (see note below). If the *peratom* keyword
-is set to "no", the per-atom quantities are still calculated,
-but they are not accessible.
+Two per-atom quantities are calculated by this compute.  The first is
+the volume of the Voronoi cell around each atom.  Any point in an
+atom's Voronoi cell is closer to that atom than any other.  The second
+is the number of faces of the Voronoi cell. This is equal to the
+number of nearest neighbors of the central atom, plus any exterior
+faces (see note below).
 
 ----------
 
@@ -97,13 +94,13 @@ present in atom_style sphere for granular models.
 
 The *edge_histo* keyword activates the compilation of a histogram of
 number of edges on the faces of the Voronoi cells in the compute
-group. The argument *maxedge* of the this keyword is the largest number
-of edges on a single Voronoi cell face expected to occur in the
-sample. This keyword adds the generation of a global vector with
-*maxedge*\ +1 entries. The last entry in the vector contains the number of
-faces with more than *maxedge* edges. Since the polygon with the
-smallest amount of edges is a triangle, entries 1 and 2 of the vector
-will always be zero.
+group. The argument *maxedge* of this keyword is the largest
+number of edges on a single Voronoi cell face expected to occur in the
+sample. This keyword generates output of a global vector by this
+compute with *maxedge*\ +1 entries. The last entry in the vector
+contains the number of faces with more than *maxedge* edges. Since the
+polygon with the smallest amount of edges is a triangle, entries 1 and
+2 of the vector will always be zero.
 
 The *edge_threshold* and *face_threshold* keywords allow the
 suppression of edges below a given minimum length and faces below a
@@ -127,8 +124,8 @@ to locate vacancies (the coordinates are given by the atom coordinates
 at the time step when the compute was first invoked), while column two
 data can be used to identify interstitial atoms.
 
-If the *neighbors* value is set to yes, then this compute creates a
-local array with 3 columns. There is one row for each face of each
+If the *neighbors* value is set to yes, then this compute also creates
+a local array with 3 columns. There is one row for each face of each
 Voronoi cell. The 3 columns are the atom ID of the atom that owns the
 cell, the atom ID of the atom in the neighboring cell (or zero if the
 face is external), and the area of the face.  The array can be
@@ -143,8 +140,8 @@ containing all the Voronoi neighbors in a system:
    compute 6 all voronoi/atom neighbors yes
    dump d2 all local 1 dump.neighbors index c_6[1] c_6[2] c_6[3]
 
-If the *face_threshold* keyword is used, then only faces
-with areas greater than the threshold are stored.
+If the *face_threshold* keyword is used, then only faces with areas
+greater than the threshold are stored.
 
 ----------
 
@@ -158,48 +155,52 @@ Voro++ software in the src/VORONOI/README file.
 
 .. note::
 
-   The calculation of Voronoi volumes is performed by each processor for
-   the atoms it owns, and includes the effect of ghost atoms stored by
-   the processor.  This assumes that the Voronoi cells of owned atoms
-   are not affected by atoms beyond the ghost atom cut-off distance.
-   This is usually a good assumption for liquid and solid systems, but
-   may lead to underestimation of Voronoi volumes in low density
-   systems.  By default, the set of ghost atoms stored by each processor
-   is determined by the cutoff used for :doc:`pair_style <pair_style>`
-   interactions.  The cutoff can be set explicitly via the
-   :doc:`comm_modify cutoff <comm_modify>` command.  The Voronoi cells
-   for atoms adjacent to empty regions will extend into those regions up
-   to the communication cutoff in :math:`x`, :math:`y`, or :math:`z`.
-   In that situation, an exterior face is created at the cutoff distance
-   normal to the :math:`x`, :math:`y`, or :math:`z` direction.  For
-   triclinic systems, the exterior face is parallel to the corresponding
-   reciprocal lattice vector.
+   The calculation of Voronoi volumes is performed by each processor
+   for the atoms it owns, and includes the effect of ghost atoms
+   stored by the processor.  This assumes that the Voronoi cells of
+   owned atoms are not affected by atoms beyond the ghost atom cut-off
+   distance.  This is usually a good assumption for liquid and solid
+   systems, but may lead to underestimation of Voronoi volumes in low
+   density systems.  By default, the set of ghost atoms stored by each
+   processor is determined by the cutoff used for :doc:`pair_style
+   <pair_style>` interactions.  The cutoff can be set explicitly via
+   the :doc:`comm_modify cutoff <comm_modify>` command.  The Voronoi
+   cells for atoms adjacent to empty regions will extend into those
+   regions up to the communication cutoff in :math:`x`, :math:`y`, or
+   :math:`z`.  In that situation, an exterior face is created at the
+   cutoff distance normal to the :math:`x`, :math:`y`, or :math:`z`
+   direction.  For triclinic systems, the exterior face is parallel to
+   the corresponding reciprocal lattice vector.
 
 .. note::
 
-   The Voro++ package performs its calculation in 3d.  This will
-   still work for a 2d LAMMPS simulation, provided all the atoms have the
-   same :math:`z`-coordinate. The Voronoi cell of each atom will be a columnar
-   polyhedron with constant cross-sectional area along the :math:`z`-direction
-   and two exterior faces at the top and bottom of the simulation box. If
-   the atoms do not all have the same :math:`z`-coordinate, then the columnar
-   cells will be accordingly distorted. The cross-sectional area of each
-   Voronoi cell can be obtained by dividing its volume by the :math:`z` extent
-   of the simulation box.  Note that you define the :math:`z` extent of the
-   simulation box for 2d simulations when using the
-   :doc:`create_box <create_box>` or :doc:`read_data <read_data>` commands.
+   The Voro++ package performs its calculation in 3d.  This will still
+   work for a 2d LAMMPS simulation, provided all the atoms have the
+   same :math:`z`-coordinate. The Voronoi cell of each atom will be a
+   columnar polyhedron with constant cross-sectional area along the
+   :math:`z`-direction and two exterior faces at the top and bottom of
+   the simulation box. If the atoms do not all have the same
+   :math:`z`-coordinate, then the columnar cells will be accordingly
+   distorted. The cross-sectional area of each Voronoi cell can be
+   obtained by dividing its volume by the :math:`z` extent of the
+   simulation box.  Note that you define the :math:`z` extent of the
+   simulation box for 2d simulations when using the :doc:`create_box
+   <create_box>` or :doc:`read_data <read_data>` commands.
 
 Output info
 """""""""""
 
-By default, this compute calculates a per-atom array with two
-columns. In regular dynamic tessellation mode the first column is the
-Voronoi volume, the second is the neighbor count, as described above
-(read above for the output data in case the *occupation* keyword is
-specified).  These values can be accessed by any command that uses
-per-atom values from a compute as input.  See the :doc:`Howto output <Howto_output>` page for an overview of LAMMPS output
-options. If the *peratom* keyword is set to "no", the per-atom array
-is still created, but it is not accessible.
+.. deprecated:: TBD
+
+   The *peratom* keyword was removed as it is no longer required.
+
+This compute calculates a per-atom array with two columns. In regular
+dynamic tessellation mode the first column is the Voronoi volume, the
+second is the neighbor count, as described above (read above for the
+output data in case the *occupation* keyword is specified).  These
+values can be accessed by any command that uses per-atom values from a
+compute as input.  See the :doc:`Howto output <Howto_output>` page for
+an overview of LAMMPS output options.
 
 If the *edge_histo* keyword is used, then this compute generates a
 global vector of length *maxedge*\ +1, containing a histogram of the
@@ -209,17 +210,6 @@ If the *neighbors* value is set to *yes*, then this compute calculates a
 local array with three columns. There is one row for each face of each
 Voronoi cell.
 
-.. note::
-
-   Some LAMMPS commands such as the :doc:`compute reduce <compute_reduce>`
-   command can accept either a per-atom or local quantity. If this compute
-   produces both quantities, the command
-   may access the per-atom quantity, even if you want to access the local
-   quantity.  This effect can be eliminated by using the *peratom*
-   keyword to turn off the production of the per-atom quantities.  For
-   the default value *yes* both quantities are produced.  For the value
-   *no*, only the local array is produced.
-
 The Voronoi cell volume will be in distance :doc:`units <units>` cubed.
 The Voronoi face area will be in distance :doc:`units <units>` squared.
 
@@ -227,7 +217,8 @@ Restrictions
 """"""""""""
 
 This compute is part of the VORONOI package.  It is only enabled if
-LAMMPS was built with that package.  See the :doc:`Build package <Build_package>` page for more info.
+LAMMPS was built with that package.  See the :doc:`Build package
+<Build_package>` page for more info.
 
 It also requires you have a copy of the Voro++ library built and
 installed on your system.  See instructions on obtaining and
@@ -241,5 +232,4 @@ Related commands
 Default
 """""""
 
-*neighbors* no, *peratom* yes
-
+The default for the *neighbors* keyword is no.
diff --git a/doc/src/fix.rst b/doc/src/fix.rst
index cffa482971..3dd7e224e7 100644
--- a/doc/src/fix.rst
+++ b/doc/src/fix.rst
@@ -77,35 +77,44 @@ for individual fixes for info on which ones can be restarted.
 
 ----------
 
-Some fixes calculate one or more of four styles of quantities: global,
-per-atom, local, or per-grid, which can be used by other commands or
-output as described below.  A global quantity is one or more
-system-wide values, e.g. the energy of a wall interacting with
-particles.  A per-atom quantity is one or more values per atom,
-e.g. the displacement vector for each atom since time 0.  Per-atom
-values are set to 0.0 for atoms not in the specified fix group.  Local
-quantities are calculated by each processor based on the atoms it
-owns, but there may be zero or more per atoms.  Per-grid quantities
-are calculated on a regular 2d or 3d grid which overlays a 2d or 3d
-simulation domain.  The grid points and the data they store are
-distributed across processors; each processor owns the grid points
-which fall within its subdomain.
+Some fixes calculate and store any of four *styles* of quantities:
+global, per-atom, local, or per-grid.
 
-Note that a single fix typically produces either global or per-atom or
-local or per-grid values (or none at all).  It does not produce both
-global and per-atom.  It can produce local or per-grid values in
-tandem with global or per-atom values.  The fix doc page will explain
-the details.
+A global quantity is one or more system-wide values, e.g. the energy
+of a wall interacting with particles.  A per-atom quantity is one or
+more values per atom, e.g. the original coordinates of each atom at
+time 0.  Per-atom values are set to 0.0 for atoms not in the specified
+fix group.  Local quantities are calculated by each processor based on
+the atoms it owns, but there may be zero or more per atom, e.g. values
+for each bond.  Per-grid quantities are calculated on a regular 2d or
+3d grid which overlays a 2d or 3d simulation domain.  The grid points
+and the data they store are distributed across processors; each
+processor owns the grid points which fall within its subdomain.
 
-Global, per-atom, local, and per-grid quantities come in three kinds:
-a single scalar value, a vector of values, or a 2d array of values.
-The doc page for each fix describes the style and kind of values it
-produces, e.g. a per-atom vector.  Some fixes produce more than one
-kind of a single style, e.g. a global scalar and a global vector.
+As a general rule of thumb, fixes that produce per-atom quantities
+have the word "atom" at the end of their style, e.g. *ave/atom*\ .
+Fixes that produce local quantities have the word "local" at the end
+of their style, e.g. *store/local*\ .  Fixes that produce per-grid
+quantities have the word "grid" at the end of their style,
+e.g. *ave/grid*\ .
 
-When a fix quantity is accessed, as in many of the output commands
-discussed below, it can be referenced via the following bracket
-notation, where ID is the ID of the fix:
+Global, per-atom, local, and per-grid quantities can also be of three
+*kinds*: a single scalar value (global only), a vector of values, or a
+2d array of values.  For per-atom, local, and per-grid quantities, a
+"vector" means a single value for each atom, each local entity
+(e.g. bond), or grid cell.  Likewise, an "array" means multiple values
+for each atom, each local entity, or each grid cell.
+
+Note that a single fix can produce any combination of global,
+per-atom, local, or per-grid values.  Likewise it can produce any
+combination of scalar, vector, or array output for each style.  The
+exception is that for per-atom, local, and per-grid output, either a
+vector or array can be produced, but not both.  The doc page for each
+fix explains the values it produces, if any.
+
+When a fix output is accessed by another input script command it is
+referenced via the following bracket notation, where ID is the ID of
+the fix:
 
 +-------------+--------------------------------------------+
 | f_ID        | entire scalar, vector, or array            |
@@ -116,19 +125,23 @@ notation, where ID is the ID of the fix:
 +-------------+--------------------------------------------+
 
 In other words, using one bracket reduces the dimension of the
-quantity once (vector :math:`\to` scalar, array :math:`\to` vector).  Using two
-brackets reduces the dimension twice (array :math:`\to` scalar).  Thus, a
-command that uses scalar fix values as input can also process elements of a
-vector or array.
+quantity once (vector :math:`\to` scalar, array :math:`\to` vector).
+Using two brackets reduces the dimension twice (array :math:`\to`
+scalar).  Thus, for example, a command that uses global scalar fix
+values as input can also process elements of a vector or array.
+Depending on the command, this can either be done directly using the
+syntax in the table, or by first defining a :doc:`variable <variable>`
+of the appropriate style to store the quantity, then using the
+variable as an input to the command.
 
-Note that commands and :doc:`variables <variable>` that use fix
-quantities typically do not allow for all kinds (e.g., a command may
-require a vector of values, not a scalar), and even if they do, the context
-in which they are called can be used to resolve which output is being
-requested.  This means there is no
-ambiguity about referring to a fix quantity as f_ID even if it
-produces, for example, both a scalar and vector.  The doc pages for
-various commands explain the details.
+Note that commands and :doc:`variables <variable>` which take fix
+outputs as input typically do not allow for all styles and kinds of
+data (e.g., a command may require global but not per-atom values, or
+it may require a vector of values, not a scalar).  This means there is
+typically no ambiguity about referring to a fix output as f_ID even if
+it produces, for example, both a scalar and vector.  The doc pages for
+various commands explain the details, including how any ambiguities
+are resolved.
 
 ----------
 
diff --git a/doc/src/fix_ave_histo.rst b/doc/src/fix_ave_histo.rst
index 8bb66f0615..31e5476f9e 100644
--- a/doc/src/fix_ave_histo.rst
+++ b/doc/src/fix_ave_histo.rst
@@ -79,9 +79,10 @@ Description
 
 Use one or more values as inputs every few timesteps to create a
 single histogram.  The histogram can then be averaged over longer
-timescales.  The resulting histogram can be used by other :doc:`output commands <Howto_output>`, and can also be written to a file.  The
-fix ave/histo/weight command has identical syntax to fix ave/histo,
-except that exactly two values must be specified.  See details below.
+timescales.  The resulting histogram can be used by other :doc:`output
+commands <Howto_output>`, and can also be written to a file.  The fix
+ave/histo/weight command has identical syntax to fix ave/histo, except
+that exactly two values must be specified.  See details below.
 
 The group specified with this command is ignored for global and local
 input values.  For per-atom input values, only atoms in the group
@@ -96,14 +97,18 @@ different ways; see the discussion of the *beyond* keyword below.
 
 Each input value can be an atom attribute (position, velocity, force
 component) or can be the result of a :doc:`compute <compute>` or
-:doc:`fix <fix>` or the evaluation of an equal-style or vector-style or
-atom-style :doc:`variable <variable>`.  The set of input values can be
-either all global, all per-atom, or all local quantities.  Inputs of
-different kinds (e.g. global and per-atom) cannot be mixed.  Atom
-attributes are per-atom vector values.  See the page for
-individual "compute" and "fix" commands to see what kinds of
-quantities they generate.  See the optional *kind* keyword below for
-how to force the fix ave/histo command to disambiguate if necessary.
+:doc:`fix <fix>` or the evaluation of an equal-style or vector-style
+or atom-style :doc:`variable <variable>`.  The set of input values can
+be either all global, all per-atom, or all local quantities.  Inputs
+of different kinds (e.g. global and per-atom) cannot be mixed.  Atom
+attributes are per-atom vector values.  See the page for individual
+"compute" and "fix" commands to see what kinds of quantities they
+generate.
+
+Note that a compute or fix can produce multiple kinds of data (global,
+per-atom, local).  If LAMMPS cannot unambiguously determine which kind
+of data to use, the optional *kind* keyword discussed below can force
+the desired disambiguation.
 
 Note that the output of this command is a single histogram for all
 input values combined together, not one histogram per input value.
@@ -258,13 +263,14 @@ keyword is set to *vector*, then all input values must be global or
 per-atom or local vectors, or columns of global or per-atom or local
 arrays.
 
-The *kind* keyword only needs to be set if a compute or fix produces
-more than one kind of output (global, per-atom, local).  If this is
-not the case, then LAMMPS will determine what kind of input is
-provided and whether all the input arguments are consistent.  If a
-compute or fix produces more than one kind of output, the *kind*
-keyword should be used to specify which output will be used.  The
-remaining input arguments must still be consistent.
+The *kind* keyword only needs to be used if any of the specified input
+computes or fixes produce more than one kind of output (global,
+per-atom, local).  If not, LAMMPS will determine the kind of data all
+the inputs produce and verify it is all the same kind.  If it is not,
+an error will be triggered.  If a compute or fix produces more than
+one kind of output, the *kind* keyword should be used to specify which
+output will be used.  The other input arguments must still be
+consistent.
 
 The *beyond* keyword determines how input values that fall outside the
 *lo* to *hi* bounds are treated.  Values such that *lo* :math:`\le` value
diff --git a/doc/src/fix_efield.rst b/doc/src/fix_efield.rst
index 2958d89794..a870590856 100644
--- a/doc/src/fix_efield.rst
+++ b/doc/src/fix_efield.rst
@@ -1,4 +1,5 @@
 .. index:: fix efield
+.. index:: fix efield/kk
 .. index:: fix efield/tip4p
 
 fix efield command
@@ -210,6 +211,12 @@ the iteration count during the minimization.
    system (the quantity being minimized), you MUST enable the
    :doc:`fix_modify <fix_modify>` *energy* option for this fix.
 
+----------
+
+.. include:: accel_styles.rst
+
+----------
+
 Restrictions
 """"""""""""
 
diff --git a/doc/src/fix_rigid.rst b/doc/src/fix_rigid.rst
index 89759da817..a50e215681 100644
--- a/doc/src/fix_rigid.rst
+++ b/doc/src/fix_rigid.rst
@@ -843,7 +843,7 @@ stress/atom <compute_stress_atom>` commands.  The former can be
 accessed by :doc:`thermodynamic output <thermo_style>`.  The default
 setting for this fix is :doc:`fix_modify virial yes <fix_modify>`.
 
-All of the *rigid* styles (not the *rigid/small* styles) compute a
+All of the *rigid* styles (but not the *rigid/small* styles) compute a
 global array of values which can be accessed by various :doc:`output
 commands <Howto_output>`.  Similar information about the bodies
 defined by the *rigid/small* styles can be accessed via the
@@ -887,7 +887,8 @@ Restrictions
 """"""""""""
 
 These fixes are all part of the RIGID package.  It is only enabled if
-LAMMPS was built with that package.  See the :doc:`Build package <Build_package>` page for more info.
+LAMMPS was built with that package.  See the :doc:`Build package
+<Build_package>` page for more info.
 
 Assigning a temperature via the :doc:`velocity create <velocity>`
 command to a system with :doc:`rigid bodies <fix_rigid>` may not have
diff --git a/doc/src/fix_spring_self.rst b/doc/src/fix_spring_self.rst
index 3383f27ebb..4453fd61c5 100644
--- a/doc/src/fix_spring_self.rst
+++ b/doc/src/fix_spring_self.rst
@@ -1,4 +1,5 @@
 .. index:: fix spring/self
+.. index:: fix spring/self/kk
 
 fix spring/self command
 =======================
@@ -80,6 +81,12 @@ invoked by the :doc:`minimize <minimize>` command.
    you MUST enable the :doc:`fix_modify <fix_modify>` *energy* option for
    this fix.
 
+----------
+
+.. include:: accel_styles.rst
+
+----------
+
 Restrictions
 """"""""""""
  none
diff --git a/doc/src/fix_srd.rst b/doc/src/fix_srd.rst
index 1fc574a7ad..8bfbcf2387 100644
--- a/doc/src/fix_srd.rst
+++ b/doc/src/fix_srd.rst
@@ -71,14 +71,15 @@ imbue the SRD particles with fluid-like properties, including an
 effective viscosity.  Thus simulations with large solute particles can
 be run more quickly, to measure solute properties like diffusivity
 and viscosity in a background fluid.  The usual LAMMPS fixes for such
-simulations, such as :doc:`fix deform <fix_deform>`, :doc:`fix viscosity <fix_viscosity>`, and :doc:`fix nvt/sllod <fix_nvt_sllod>`,
+simulations, such as :doc:`fix deform <fix_deform>`,
+:doc:`fix viscosity <fix_viscosity>`, and :doc:`fix nvt/sllod <fix_nvt_sllod>`,
 can be used in conjunction with the SRD model.
 
-For more details on how the SRD model is implemented in LAMMPS, :ref:`this paper <Petersen1>` describes the implementation and usage of pure SRD
-fluids.  :ref:`This paper <Lechman>`, which is nearly complete, describes
-the implementation and usage of mixture systems (solute particles in
-an SRD fluid).  See the examples/srd directory for sample input
-scripts using SRD particles in both settings.
+For more details on how the SRD model is implemented in LAMMPS,
+:ref:`(Petersen) <Petersen1>` describes the implementation and usage of
+pure SRD fluids.  See the ``examples/srd`` directory for sample input
+scripts using SRD particles for that and for mixture systems (solute
+particles in an SRD fluid).
 
 This fix does two things:
 
@@ -357,28 +358,28 @@ These are the 12 quantities.  All are values for the current timestep,
 except for quantity 5 and the last three, each of which are
 cumulative quantities since the beginning of the run.
 
-* (1) # of SRD/big collision checks performed
-* (2) # of SRDs which had a collision
-* (3) # of SRD/big collisions (including multiple bounces)
-* (4) # of SRD particles inside a big particle
-* (5) # of SRD particles whose velocity was rescaled to be < Vmax
-* (6) # of bins for collision searching
-* (7) # of bins for SRD velocity rotation
-* (8) # of bins in which SRD temperature was computed
-* (9) SRD temperature
-* (10) # of SRD particles which have undergone max # of bounces
-* (11) max # of bounces any SRD particle has had in a single step
-* (12) # of reneighborings due to SRD particles moving too far
+(1) # of SRD/big collision checks performed
+(2) # of SRDs which had a collision
+(3) # of SRD/big collisions (including multiple bounces)
+(4) # of SRD particles inside a big particle
+(5) # of SRD particles whose velocity was rescaled to be < Vmax
+(6) # of bins for collision searching
+(7) # of bins for SRD velocity rotation
+(8) # of bins in which SRD temperature was computed
+(9) SRD temperature
+(10) # of SRD particles which have undergone max # of bounces
+(11) max # of bounces any SRD particle has had in a single step
+(12) # of reneighborings due to SRD particles moving too far
 
 No parameter of this fix can be used with the *start/stop* keywords of
-the :doc:`run <run>` command.  This fix is not invoked during :doc:`energy minimization <minimize>`.
+the :doc:`run <run>` command.  This fix is not invoked during
+:doc:`energy minimization <minimize>`.
 
 Restrictions
 """"""""""""
 
-This command can only be used if LAMMPS was built with the SRD
-package.  See the :doc:`Build package <Build_package>` doc
-page for more info.
+This command can only be used if LAMMPS was built with the SRD package.
+See the :doc:`Build package <Build_package>` doc page for more info.
 
 Related commands
 """"""""""""""""
@@ -403,7 +404,3 @@ no, and rescale = yes.
 
 **(Petersen)** Petersen, Lechman, Plimpton, Grest, in' t Veld, Schunk, J
 Chem Phys, 132, 174106 (2010).
-
-.. _Lechman:
-
-**(Lechman)** Lechman, et al, in preparation (2010).
diff --git a/doc/src/pair_ilp_tmd.rst b/doc/src/pair_ilp_tmd.rst
index 482d75a100..70a4768389 100644
--- a/doc/src/pair_ilp_tmd.rst
+++ b/doc/src/pair_ilp_tmd.rst
@@ -22,12 +22,12 @@ Examples
 .. code-block:: LAMMPS
 
    pair_style  hybrid/overlay ilp/tmd 16.0 1
-   pair_coeff  * * ilp/tmd  TMD.ILP Mo S S
+   pair_coeff  * * ilp/tmd  MoS2.ILP Mo S S
 
    pair_style  hybrid/overlay sw/mod sw/mod ilp/tmd 16.0
    pair_coeff  * * sw/mod 1  tmd.sw.mod Mo S S NULL NULL NULL
    pair_coeff  * * sw/mod 2  tmd.sw.mod NULL NULL NULL Mo S S
-   pair_coeff  * * ilp/tmd   TMD.ILP    Mo S S Mo S S
+   pair_coeff  * * ilp/tmd   MoS2.ILP   Mo S S Mo S S
 
 Description
 """""""""""
@@ -69,7 +69,7 @@ calculating the normals.
    each atom `i`, its six nearest neighboring atoms belonging to the same
    sub-layer are chosen to define the normal vector `{\bf n}_i`.
 
-The parameter file (e.g. TMD.ILP), is intended for use with *metal*
+The parameter file (e.g. MoS2.ILP) is intended for use with *metal*
 :doc:`units <units>`, with energies in meV. Two additional parameters,
 *S*, and *rcut* are included in the parameter file. *S* is designed to
 facilitate scaling of energies. *rcut* is designed to build the neighbor
@@ -77,7 +77,7 @@ list for calculating the normals for each atom pair.
 
 .. note::
 
+   The parameters presented in the parameter file (e.g. MoS2.ILP)
+   The parameters presented in the parameter file (e.g. MoS2.ILP),
    are fitted with taper function by setting the cutoff equal to 16.0
    Angstrom.  Using different cutoff or taper function should be careful.
    These parameters provide a good description in both short- and long-range
@@ -133,10 +133,10 @@ if LAMMPS was built with that package.  See the :doc:`Build package
 This pair style requires the newton setting to be *on* for pair
 interactions.
 
-The TMD.ILP potential file provided with LAMMPS (see the potentials
+The MoS2.ILP potential file provided with LAMMPS (see the potentials
 directory) are parameterized for *metal* units.  You can use this
 potential with any LAMMPS units, but you would need to create your own
-custom TMD.ILP potential file with coefficients listed in the appropriate
+custom MoS2.ILP potential file with coefficients listed in the appropriate
 units, if your simulation does not use *metal* units.
 
 Related commands
diff --git a/doc/src/pair_reaxff.rst b/doc/src/pair_reaxff.rst
index 4dac9baf85..067eb3afc3 100644
--- a/doc/src/pair_reaxff.rst
+++ b/doc/src/pair_reaxff.rst
@@ -43,22 +43,22 @@ Examples
 Description
 """""""""""
 
-Style *reaxff* computes the ReaxFF potential of van Duin, Goddard and
-co-workers.  ReaxFF uses distance-dependent bond-order functions to
+Pair style *reaxff* computes the ReaxFF potential of van Duin, Goddard
+and co-workers.  ReaxFF uses distance-dependent bond-order functions to
 represent the contributions of chemical bonding to the potential
-energy. There is more than one version of ReaxFF. The version
+energy.  There is more than one version of ReaxFF.  The version
 implemented in LAMMPS uses the functional forms documented in the
 supplemental information of the following paper:
-:ref:`(Chenoweth et al., 2008) <Chenoweth_20082>`.  The version integrated
-into LAMMPS matches the version of ReaxFF From Summer 2010.  For more
-technical details about the pair reaxff implementation of ReaxFF, see
-the :ref:`(Aktulga) <Aktulga>` paper. The *reaxff* style was initially
-implemented as a stand-alone C code and is now converted to C++ and
-integrated into LAMMPS as a package.
+:ref:`(Chenoweth et al., 2008) <Chenoweth_20082>` and matches the
+version of the reference ReaxFF implementation from Summer 2010.  For
+more technical details about the implementation of ReaxFF in pair style
+*reaxff*, see the :ref:`(Aktulga) <Aktulga>` paper. The *reaxff* style
+was initially implemented as a stand-alone C code and is now converted
+to C++ and integrated into LAMMPS as a package.
 
 The *reaxff/kk* style is a Kokkos version of the ReaxFF potential that
-is derived from the *reaxff* style. The Kokkos version can run on GPUs
-and can also use OpenMP multithreading. For more information about the
+is derived from the *reaxff* style.  The Kokkos version can run on GPUs
+and can also use OpenMP multithreading.  For more information about the
 Kokkos package, see :doc:`Packages details <Packages_details>` and
 :doc:`Speed kokkos <Speed_kokkos>` doc pages.  One important
 consideration when using the *reaxff/kk* style is the choice of either
diff --git a/doc/src/pair_yukawa_colloid.rst b/doc/src/pair_yukawa_colloid.rst
index 6611ea04e4..c6f201d249 100644
--- a/doc/src/pair_yukawa_colloid.rst
+++ b/doc/src/pair_yukawa_colloid.rst
@@ -1,11 +1,12 @@
 .. index:: pair_style yukawa/colloid
 .. index:: pair_style yukawa/colloid/gpu
+.. index:: pair_style yukawa/colloid/kk
 .. index:: pair_style yukawa/colloid/omp
 
 pair_style yukawa/colloid command
 =================================
 
-Accelerator Variants: *yukawa/colloid/gpu*, *yukawa/colloid/omp*
+Accelerator Variants: *yukawa/colloid/gpu*, *yukawa/colloid/kk*, *yukawa/colloid/omp*
 
 Syntax
 """"""
@@ -131,6 +132,12 @@ per-type polydispersity is allowed.  This means all particles of the
 same type must have the same diameter.  Each type can have a different
 diameter.
 
+----------
+
+.. include:: accel_styles.rst
+
+----------
+
 Related commands
 """"""""""""""""
 
diff --git a/doc/src/thermo_style.rst b/doc/src/thermo_style.rst
index 63ad59e553..c3c607a479 100644
--- a/doc/src/thermo_style.rst
+++ b/doc/src/thermo_style.rst
@@ -385,19 +385,20 @@ creates a global vector with 6 values.
 The *c_ID* and *c_ID[I]* and *c_ID[I][J]* keywords allow global values
 calculated by a compute to be output.  As discussed on the
 :doc:`compute <compute>` doc page, computes can calculate global,
-per-atom, or local values.  Only global values can be referenced by
-this command.  However, per-atom compute values for an individual atom
-can be referenced in a :doc:`variable <variable>` and the variable
-referenced by thermo_style custom, as discussed below.  See the
-discussion above for how the I in *c_ID[I]* can be specified with a
-wildcard asterisk to effectively specify multiple values from a global
-compute vector.
+per-atom, local, and per-grid values.  Only global values can be
+referenced by this command.  However, per-atom compute values for an
+individual atom can be referenced in an :doc:`equal-style variable
+<variable>` and the variable referenced by thermo_style custom, as
+discussed below.  See the discussion above for how the I in *c_ID[I]*
+can be specified with a wildcard asterisk to effectively specify
+multiple values from a global compute vector.
 
 The ID in the keyword should be replaced by the actual ID of a compute
 that has been defined elsewhere in the input script.  See the
-:doc:`compute <compute>` command for details.  If the compute calculates
-a global scalar, vector, or array, then the keyword formats with 0, 1,
-or 2 brackets will reference a scalar value from the compute.
+:doc:`compute <compute>` command for details.  If the compute
+calculates a global scalar, vector, or array, then the keyword formats
+with 0, 1, or 2 brackets will reference a scalar value from the
+compute.
 
 Note that some computes calculate "intensive" global quantities like
 temperature; others calculate "extensive" global quantities like
@@ -410,13 +411,14 @@ norm <thermo_modify>` option being used.
 
 The *f_ID* and *f_ID[I]* and *f_ID[I][J]* keywords allow global values
 calculated by a fix to be output.  As discussed on the :doc:`fix
-<fix>` doc page, fixes can calculate global, per-atom, or local
-values.  Only global values can be referenced by this command.
-However, per-atom fix values can be referenced for an individual atom
-in a :doc:`variable <variable>` and the variable referenced by
-thermo_style custom, as discussed below.  See the discussion above for
-how the I in *f_ID[I]* can be specified with a wildcard asterisk to
-effectively specify multiple values from a global fix vector.
+<fix>` doc page, fixes can calculate global, per-atom, local, and
+per-grid values.  Only global values can be referenced by this
+command.  However, per-atom fix values can be referenced for an
+individual atom in an :doc:`equal-style variable <variable>` and the
+variable referenced by thermo_style custom, as discussed below.  See
+the discussion above for how the I in *f_ID[I]* can be specified with
+a wildcard asterisk to effectively specify multiple values from a
+global fix vector.
 
 The ID in the keyword should be replaced by the actual ID of a fix
 that has been defined elsewhere in the input script.  See the
@@ -438,14 +440,15 @@ output.  The name in the keyword should be replaced by the variable
 name that has been defined elsewhere in the input script.  Only
 equal-style and vector-style variables can be referenced; the latter
 requires a bracketed term to specify the Ith element of the vector
-calculated by the variable.  However, an atom-style variable can be
-referenced for an individual atom by an equal-style variable and that
-variable referenced.  See the :doc:`variable <variable>` command for
-details.  Variables of style *equal* and *vector* and *atom* define a
-formula which can reference per-atom properties or thermodynamic
-keywords, or they can invoke other computes, fixes, or variables when
-evaluated, so this is a very general means of creating thermodynamic
-output.
+calculated by the variable.  However, an equal-style variable can use
+an atom-style variable in its formula indexed by the ID of an
+individual atom.  This is a way to output a specific atom's per-atom
+coordinates or other per-atom properties in thermo output.  See the
+:doc:`variable <variable>` command for details.  Note that variables
+of style *equal* and *vector* and *atom* define a formula which can
+reference per-atom properties or thermodynamic keywords, or they can
+invoke other computes, fixes, or variables when evaluated, so this is
+a very general means of creating thermodynamic output.
 
 Note that equal-style and vector-style variables are assumed to
 produce "intensive" global quantities, which are thus printed as-is,
diff --git a/doc/src/variable.rst b/doc/src/variable.rst
index 28c0d29799..f1a316da1f 100644
--- a/doc/src/variable.rst
+++ b/doc/src/variable.rst
@@ -550,12 +550,11 @@ variables.
 Most of the formula elements produce a scalar value.  Some produce a
 global or per-atom vector of values.  Global vectors can be produced
 by computes or fixes or by other vector-style variables.  Per-atom
-vectors are produced by atom vectors, compute references that
-represent a per-atom vector, fix references that represent a per-atom
-vector, and variables that are atom-style variables.  Math functions
-that operate on scalar values produce a scalar value; math function
-that operate on global or per-atom vectors do so element-by-element
-and produce a global or per-atom vector.
+vectors are produced by atom vectors, computes or fixes which output a
+per-atom vector or array, and variables that are atom-style variables.
+Math functions that operate on scalar values produce a scalar value;
+math functions that operate on global or per-atom vectors do so
+element-by-element and produce a global or per-atom vector.
 
 A formula for equal-style variables cannot use any formula element
 that produces a global or per-atom vector.  A formula for a
@@ -564,12 +563,13 @@ scalar value or a global vector value, but cannot use a formula
 element that produces a per-atom vector.  A formula for an atom-style
 variable can use formula elements that produce either a scalar value
 or a per-atom vector, but not one that produces a global vector.
+
 Atom-style variables are evaluated by other commands that define a
-:doc:`group <group>` on which they operate, e.g. a :doc:`dump <dump>` or
-:doc:`compute <compute>` or :doc:`fix <fix>` command.  When they invoke
-the atom-style variable, only atoms in the group are included in the
-formula evaluation.  The variable evaluates to 0.0 for atoms not in
-the group.
+:doc:`group <group>` on which they operate, e.g. a :doc:`dump <dump>`
+or :doc:`compute <compute>` or :doc:`fix <fix>` command.  When they
+invoke the atom-style variable, only atoms in the group are included
+in the formula evaluation.  The variable evaluates to 0.0 for atoms
+not in the group.
 
 ----------
 
@@ -1138,69 +1138,74 @@ only defined if an :doc:`atom_style <atom_style>` is being used that
 defines molecule IDs.
 
 Note that many other atom attributes can be used as inputs to a
-variable by using the :doc:`compute property/atom <compute_property_atom>` command and then specifying
-a quantity from that compute.
+variable by using the :doc:`compute property/atom
+<compute_property_atom>` command and then specifying a quantity from
+that compute.
 
 ----------
 
 Compute References
 ------------------
 
-Compute references access quantities calculated by a
-:doc:`compute <compute>`.  The ID in the reference should be replaced by
-the ID of a compute defined elsewhere in the input script.  As
-discussed in the page for the :doc:`compute <compute>` command,
-computes can produce global, per-atom, or local values.  Only global
-and per-atom values can be used in a variable.  Computes can also
-produce a scalar, vector, or array.
+Compute references access quantities calculated by a :doc:`compute
+<compute>`.  The ID in the reference should be replaced by the ID of a
+compute defined elsewhere in the input script.
 
-An equal-style variable can only use scalar values, which means a
-global scalar, or an element of a global or per-atom vector or array.
-A vector-style variable can use scalar values or a global vector of
-values, or a column of a global array of values.  Atom-style variables
-can use global scalar values.  They can also use per-atom vector
-values, or a column of a per-atom array.  See the doc pages for
-individual computes to see what kind of values they produce.
+As discussed on the page for the :doc:`compute <compute>` command,
+computes can produce global, per-atom, local, and per-grid values.
+Only global and per-atom values can be used in a variable.  Computes
+can also produce scalars (global only), vectors, and arrays.  See the
+doc pages for individual computes to see what different kinds of data
+they produce.
 
-Examples of different kinds of compute references are as follows.
-There is typically no ambiguity (see exception below) as to what a
-reference means, since computes only produce either global or per-atom
-quantities, never both.
+An equal-style variable can only use scalar values, either from global
+or per-atom data.  In the case of per-atom data, this would be a value
+for a specific atom.
 
-+-------------+-------------------------------------------------------------------------------------------------------+
-| c_ID       | global scalar, or per-atom vector                                                                      |
-+-------------+-------------------------------------------------------------------------------------------------------+
-| c_ID[I]    | Ith element of global vector, or atom I's value in per-atom vector, or Ith column from per-atom array  |
-+-------------+-------------------------------------------------------------------------------------------------------+
-| c_ID[I][J] | I,J element of global array, or atom I's Jth value in per-atom array                                   |
-+-------------+-------------------------------------------------------------------------------------------------------+
+A vector-style variable can use scalar values (same as for equal-style
+variables), or global vectors of values.  The latter can also be a
+column of a global array.
 
-For I and J indices, integers can be specified or a variable name,
-specified as v_name, where name is the name of the variable.  The
-rules for this syntax are the same as for the "Atom Values and
-Vectors" discussion above.
+Atom-style variables can use scalar values (same as for equal-style
+variables), or per-atom vectors of values.  The latter can also be a
+column of a per-atom array.
 
-One source of ambiguity for compute references is when a vector-style
-variable refers to a compute that produces both a global scalar and a
-global vector.  Consider a compute with ID "foo" that does this,
-referenced as follows by variable "a", where "myVec" is another
-vector-style variable:
+The various allowed compute references in the variable formulas for
+equal-, vector-, and atom-style variables are listed in the following
+table:
 
-.. code-block:: LAMMPS
++--------+------------+------------------------------------------+
+| equal  | c_ID       | global scalar                            |
+| equal  | c_ID[I]    | element of global vector                 |
+| equal  | c_ID[I][J] | element of global array                  |
+| equal  | C_ID[I]    | element of per-atom vector (I = atom ID) |
+| equal  | C_ID[I][J] | element of per-atom array (I = atom ID)  |
++--------+------------+------------------------------------------+
+| vector | c_ID       | global vector                            |
+| vector | c_ID[I]    | column of global array                   |
++--------+------------+------------------------------------------+
+| atom   | c_ID       | per-atom vector                          |
+| atom   | c_ID[I]    | column of per-atom array                 |
++--------+------------+------------------------------------------+
 
-   variable a vector c_foo*v_myVec
+Note that if an equal-style variable formula wishes to access per-atom
+data from a compute, it must use capital "C" as the ID prefix and not
+lower-case "c".
 
-The reference "c_foo" could refer to either the global scalar or
-global vector produced by compute "foo".  In this case, "c_foo" will
-always refer to the global scalar, and "C_foo" can be used to
-reference the global vector.  Similarly if the compute produces both a
-global vector and global array, then "c_foo[I]" will always refer to
-an element of the global vector, and "C_foo[I]" can be used to
-reference the Ith column of the global array.
+Also note that if a vector- or atom-style variable formula needs to
+access a scalar value from a compute (i.e. the 5 kinds of values in
+the first 5 lines of the table), it can not do so directly.  Instead,
+it can use a reference to an equal-style variable which stores the
+scalar value from the compute.
 
-Note that if a variable containing a compute is evaluated directly in
-an input script (not during a run), then the values accessed by the
-compute must be current.  See the discussion below about "Variable
+The I and J indices in these compute references can be integers or can
+be a variable name, specified as v_name, where name is the name of the
+variable.  The rules for this syntax are the same as for indices in
+the "Atom Values and Vectors" discussion above.
+
+If a variable containing a compute is evaluated directly in an input
+script (not during a run), then the values accessed by the compute
+should be current.  See the discussion below about "Variable
 Accuracy".
 
 ----------
@@ -1208,51 +1213,59 @@ Accuracy".
 Fix References
 --------------
 
-Fix references access quantities calculated by a :doc:`fix <compute>`.
+Fix references access quantities calculated by a :doc:`fix <fix>`.
 The ID in the reference should be replaced by the ID of a fix defined
-elsewhere in the input script.  As discussed in the page for the
-:doc:`fix <fix>` command, fixes can produce global, per-atom, or local
-values.  Only global and per-atom values can be used in a variable.
-Fixes can also produce a scalar, vector, or array.  An equal-style
-variable can only use scalar values, which means a global scalar, or
-an element of a global or per-atom vector or array.  Atom-style
-variables can use the same scalar values.  They can also use per-atom
-vector values.  A vector value can be a per-atom vector itself, or a
-column of an per-atom array.  See the doc pages for individual fixes
-to see what kind of values they produce.
+elsewhere in the input script.
 
-The different kinds of fix references are exactly the same as the
-compute references listed in the above table, where "c\_" is replaced
-by "f\_".  Again, there is typically no ambiguity (see exception below)
-as to what a reference means, since fixes only produce either global
-or per-atom quantities, never both.
+As discussed on the page for the :doc:`fix <fix>` command, fixes can
+produce global, per-atom, local, and per-grid values.  Only global and
+per-atom values can be used in a variable.  Fixes can also produce
+scalars (global only), vectors, and arrays.  See the doc pages for
+individual fixes to see what different kinds of data they produce.
 
-+-------------+-------------------------------------------------------------------------------------------------------+
-| f_ID       | global scalar, or per-atom vector                                                                      |
-+-------------+-------------------------------------------------------------------------------------------------------+
-| f_ID[I]    | Ith element of global vector, or atom I's value in per-atom vector, or Ith column from per-atom array  |
-+-------------+-------------------------------------------------------------------------------------------------------+
-| f_ID[I][J] | I,J element of global array, or atom I's Jth value in per-atom array                                   |
-+-------------+-------------------------------------------------------------------------------------------------------+
+An equal-style variable can only use scalar values, either from global
+or per-atom data.  In the case of per-atom data, this would be a value
+for a specific atom.
 
-For I and J indices, integers can be specified or a variable name,
-specified as v_name, where name is the name of the variable.  The
-rules for this syntax are the same as for the "Atom Values and
-Vectors" discussion above.
+A vector-style variable can use scalar values (same as for equal-style
+variables), or global vectors of values.  The latter can also be a
+column of a global array.
 
-One source of ambiguity for fix references is the same ambiguity
-discussed for compute references above.  Namely when a vector-style
-variable refers to a fix that produces both a global scalar and a
-global vector.  The solution is the same as for compute references.
-For a fix with ID "foo", "f_foo" will always refer to the global
-scalar, and "F_foo" can be used to reference the global vector.  And
-similarly for distinguishing between a fix's global vector versus
-global array with "f_foo[I]" versus "F_foo[I]".
+Atom-style variables can use scalar values (same as for equal-style
+variables), or per-atom vectors of values.  The latter can also be a
+column of a per-atom array.
 
-Note that if a variable containing a fix is evaluated directly in an
-input script (not during a run), then the values accessed by the fix
-should be current.  See the discussion below about "Variable
-Accuracy".
+The allowed fix references in variable formulas for equal-, vector-,
+and atom-style variables are listed in the following table:
+
++--------+------------+------------------------------------------+
+| equal  | f_ID       | global scalar                            |
+| equal  | f_ID[I]    | element of global vector                 |
+| equal  | f_ID[I][J] | element of global array                  |
+| equal  | F_ID[I]    | element of per-atom vector (I = atom ID) |
+| equal  | F_ID[I][J] | element of per-atom array (I = atom ID)  |
++--------+------------+------------------------------------------+
+| vector | f_ID       | global vector                            |
+| vector | f_ID[I]    | column of global array                   |
++--------+------------+------------------------------------------+
+| atom   | f_ID       | per-atom vector                          |
+| atom   | f_ID[I]    | column of per-atom array                 |
++--------+------------+------------------------------------------+
+
+Note that if an equal-style variable formula wishes to access per-atom
+data from a fix, it must use capital "F" as the ID prefix and not
+lower-case "f".
+
+Also note that if a vector- or atom-style variable formula needs to
+access a scalar value from a fix (i.e. the 5 kinds of values in the
+first 5 lines of the table), it can not do so directly.  Instead, it
+can use a reference to an equal-style variable which stores the scalar
+value from the fix.
+
+The I and J indices in these fix references can be integers or can be
+a variable name, specified as v_name, where name is the name of the
+variable.  The rules for this syntax are the same as for indices in
+the "Atom Values and Vectors" discussion above.
 
 Note that some fixes only generate quantities on certain timesteps.
 If a variable attempts to access the fix on non-allowed timesteps, an
@@ -1260,6 +1273,10 @@ error is generated.  For example, the :doc:`fix ave/time <fix_ave_time>`
 command may only generate averaged quantities every 100 steps.  See
 the doc pages for individual fix commands for details.
 
+If a variable containing a fix is evaluated directly in an input
+script (not during a run), then the values accessed by the fix should
+be current.  See the discussion below about "Variable Accuracy".
+
 ----------
 
 Variable References
@@ -1294,26 +1311,32 @@ including other atom-style or atomfile-style variables.  If it uses a
 vector-style variable, a subscript must be used to access a single
 value from the vector-style variable.
 
-Examples of different kinds of variable references are as follows.
-There is no ambiguity as to what a reference means, since variables
-produce only a global scalar or global vector or per-atom vector.
+The allowed variable references in variable formulas for equal-,
+vector-, and atom-style variables are listed in the following table.
+Note that there is no ambiguity as to what a reference means, since
+referenced variables produce only a global scalar or global vector or
+per-atom vector.
 
-+------------+----------------------------------------------------------------------+
-| v_name    | global scalar from equal-style variable                               |
-+------------+----------------------------------------------------------------------+
-| v_name    | global vector from vector-style variable                              |
-+------------+----------------------------------------------------------------------+
-| v_name    | per-atom vector from atom-style or atomfile-style variable            |
-+------------+----------------------------------------------------------------------+
-| v_name[I] | Ith element of a global vector from vector-style variable             |
-+------------+----------------------------------------------------------------------+
-| v_name[I] | value of atom with ID = I from atom-style or atomfile-style variable  |
-+------------+----------------------------------------------------------------------+
++--------+-----------+-----------------------------------------------------------------------------------+
+| equal  | v_name    | global scalar from an equal-style variable                                        |
+| equal  | v_name[I] | element of global vector from a vector-style variable                             |
+| equal  | v_name[I] | element of per-atom vector (I = atom ID) from an atom- or atomfile-style variable |
++--------+-----------+-----------------------------------------------------------------------------------+
+| vector | v_name    | global scalar from an equal-style variable                                        |
+| vector | v_name    | global vector from a vector-style variable                                        |
+| vector | v_name[I] | element of global vector from a vector-style variable                             |
+| vector | v_name[I] | element of per-atom vector (I = atom ID) from an atom- or atomfile-style variable |
++--------+-----------+-----------------------------------------------------------------------------------+
+| atom   | v_name    | global scalar from an equal-style variable                                        |
+| atom   | v_name    | per-atom vector from an atom-style or atomfile-style variable                     |
+| atom   | v_name[I] | element of global vector from a vector-style variable                             |
+| atom   | v_name[I] | element of per-atom vector (I = atom ID) from an atom- or atomfile-style variable |
++--------+-----------+-----------------------------------------------------------------------------------+
 
 For the I index, an integer can be specified or a variable name,
 specified as v_name, where name is the name of the variable.  The
-rules for this syntax are the same as for the "Atom Values and
-Vectors" discussion above.
+rules for this syntax are the same as for indices in the "Atom Values
+and Vectors" discussion above.
 
 ----------
 
diff --git a/examples/mliap/in.mliap.quadratic.compute b/examples/mliap/in.mliap.quadratic.compute
index 929dbf3824..cc9ad331b5 100644
--- a/examples/mliap/in.mliap.quadratic.compute
+++ b/examples/mliap/in.mliap.quadratic.compute
@@ -65,7 +65,7 @@ compute         bsum2 snapgroup2 reduce sum c_b[*]
 # fix 		bsum2 all ave/time 1 1 1 c_bsum2 file bsum2.dat mode vector
 compute		vbsum all reduce sum c_vb[*]
 # fix 		vbsum all ave/time 1 1 1 c_vbsum file vbsum.dat mode vector
-variable	db_2_100 equal c_db[2][100]
+variable	db_2_100 equal C_db[2][100]
 
 # test output:   1: total potential energy
 #                2: xy component of stress tensor
diff --git a/examples/mliap/in.mliap.snap.compute b/examples/mliap/in.mliap.snap.compute
index 4cfccedbdf..c49365f55f 100644
--- a/examples/mliap/in.mliap.snap.compute
+++ b/examples/mliap/in.mliap.snap.compute
@@ -65,7 +65,7 @@ compute         bsum2 snapgroup2 reduce sum c_b[*]
 # fix 		bsum2 all ave/time 1 1 1 c_bsum2 file bsum2.dat mode vector
 compute		vbsum all reduce sum c_vb[*]
 # fix 		vbsum all ave/time 1 1 1 c_vbsum file vbsum.dat mode vector
-variable	db_2_25 equal c_db[2][25]
+variable	db_2_25 equal C_db[2][25]
 
 thermo 		100
 
diff --git a/examples/snap/in.grid.snap b/examples/snap/in.grid.snap
index 08c95a004f..da48957d97 100644
--- a/examples/snap/in.grid.snap
+++ b/examples/snap/in.grid.snap
@@ -67,18 +67,18 @@ compute 	mygridlocal all sna/grid/local grid ${ngrid} ${ngrid} ${ngrid} &
 
 # define output
 
-variable	B5atom equal c_b[2][5]
+variable	B5atom equal C_b[2][5]
 variable	B5grid equal c_mygrid[8][8]
 
 variable	rmse_global equal "sqrt(   &
 	 (c_mygrid[8][1] - x[2])^2 +      &
 	 (c_mygrid[8][2] - y[2])^2 +      &
 	 (c_mygrid[8][3] - z[2])^2 +      &
-	 (c_mygrid[8][4] - c_b[2][1])^2 + &
-	 (c_mygrid[8][5] - c_b[2][2])^2 + &
-	 (c_mygrid[8][6] - c_b[2][3])^2 + &
-	 (c_mygrid[8][7] - c_b[2][4])^2 + &
-	 (c_mygrid[8][8] - c_b[2][5])^2   &
+	 (c_mygrid[8][4] - C_b[2][1])^2 + &
+	 (c_mygrid[8][5] - C_b[2][2])^2 + &
+	 (c_mygrid[8][6] - C_b[2][3])^2 + &
+	 (c_mygrid[8][7] - C_b[2][4])^2 + &
+	 (c_mygrid[8][8] - C_b[2][5])^2   &
 	 )"
 
 thermo_style	custom step v_B5atom v_B5grid v_rmse_global
diff --git a/examples/snap/in.grid.tri b/examples/snap/in.grid.tri
index 5283957eb8..95a14f3bb4 100644
--- a/examples/snap/in.grid.tri
+++ b/examples/snap/in.grid.tri
@@ -87,18 +87,18 @@ compute 	mygridlocal all sna/grid/local grid ${ngridx} ${ngridy} ${ngridz} &
 
 # define output
 
-variable	B5atom equal c_b[7][5]
+variable	B5atom equal C_b[7][5]
 variable	B5grid equal c_mygrid[13][8]
 
 # do not compare x,y,z because assignment of ids
 # to atoms is not unnique for different processor grids
 
 variable	rmse_global equal "sqrt(    &
-	 (c_mygrid[13][4] - c_b[7][1])^2 + &
-	 (c_mygrid[13][5] - c_b[7][2])^2 + &
-	 (c_mygrid[13][6] - c_b[7][3])^2 + &
-	 (c_mygrid[13][7] - c_b[7][4])^2 + &
-	 (c_mygrid[13][8] - c_b[7][5])^2   &
+	 (c_mygrid[13][4] - C_b[7][1])^2 + &
+	 (c_mygrid[13][5] - C_b[7][2])^2 + &
+	 (c_mygrid[13][6] - C_b[7][3])^2 + &
+	 (c_mygrid[13][7] - C_b[7][4])^2 + &
+	 (c_mygrid[13][8] - C_b[7][5])^2   &
 	 )"
 
 thermo_style	custom step v_B5atom v_B5grid v_rmse_global
diff --git a/examples/snap/in.snap.compute b/examples/snap/in.snap.compute
index b0c7314882..8d2ffe8b96 100644
--- a/examples/snap/in.snap.compute
+++ b/examples/snap/in.snap.compute
@@ -70,7 +70,7 @@ compute         bsum2 snapgroup2 reduce sum c_b[*]
 # fix 		bsum2 all ave/time 1 1 1 c_bsum2 file bsum2.dat mode vector
 compute		vbsum all reduce sum c_vb[*]
 # fix 		vbsum all ave/time 1 1 1 c_vbsum file vbsum.dat mode vector
-variable	db_2_25 equal c_db[2][25]
+variable	db_2_25 equal C_db[2][25]
 
 # set up compute snap generating global array
 
diff --git a/examples/snap/in.snap.compute.quadratic b/examples/snap/in.snap.compute.quadratic
index e03d4af3bf..20d5ed3039 100644
--- a/examples/snap/in.snap.compute.quadratic
+++ b/examples/snap/in.snap.compute.quadratic
@@ -70,7 +70,7 @@ compute         bsum2 snapgroup2 reduce sum c_b[*]
 # fix 		bsum2 all ave/time 1 1 1 c_bsum2 file bsum2.dat mode vector
 compute		vbsum all reduce sum c_vb[*]
 # fix 		vbsum all ave/time 1 1 1 c_vbsum file vbsum.dat mode vector
-variable	db_2_100 equal c_db[2][100]
+variable	db_2_100 equal C_db[2][100]
 
 # set up compute snap generating global array
 
diff --git a/examples/voronoi/in.voronoi b/examples/voronoi/in.voronoi
index 5254969fbd..79b6c6efec 100644
--- a/examples/voronoi/in.voronoi
+++ b/examples/voronoi/in.voronoi
@@ -146,10 +146,10 @@ variable i2 equal 257
 compute v1 all voronoi/atom occupation
 compute r0 all   reduce sum c_v1[1]
 compute r1 all   reduce sum c_v1[2]
-variable d5a equal c_v1[${i1}][1]
-variable d5b equal c_v1[${i2}][1]
-variable d5c equal c_v1[${i1}][2]
-variable d5d equal c_v1[${i2}][2]
+variable d5a equal C_v1[${i1}][1]
+variable d5b equal C_v1[${i2}][1]
+variable d5c equal C_v1[${i1}][2]
+variable d5d equal C_v1[${i2}][2]
 thermo_style custom c_r0 c_r1 v_d5a v_d5b v_d5c v_d5d
 
 run 0
diff --git a/examples/voronoi/in.voronoi.data b/examples/voronoi/in.voronoi.data
index 853c2c2bd1..e5d925c498 100644
--- a/examples/voronoi/in.voronoi.data
+++ b/examples/voronoi/in.voronoi.data
@@ -63,11 +63,9 @@ undump          dlocal
 # TEST 2: 
 #
 
-# This compute voronoi generates  
-# local and global quantities, but
-# not per-atom quantities
+# This compute voronoi generates peratom and local and global quantities
 
-compute 	v2 all voronoi/atom neighbors yes edge_histo 6 peratom no
+compute 	v2 all voronoi/atom neighbors yes edge_histo 6
 
 # write voronoi local quantities to a file
 
@@ -75,7 +73,7 @@ dump            d2 all local  1 dump.neighbors2 index c_v2[1] c_v2[2] c_v2[3]
 
 # sum up a voronoi local quantity
 
-compute 	sumarea all reduce sum c_v2[3]
+compute 	sumarea all reduce sum c_v2[3] inputs local
 
 # output voronoi global quantities
 
@@ -83,6 +81,3 @@ thermo_style 	custom c_sumarea c_v2[3] c_v2[4] c_v2[5] c_v2[6] c_v2[7]
 thermo 		1
 
 run  		0
-
-
-
diff --git a/lib/pace/Install.py b/lib/pace/Install.py
index 2a8cd2f1f3..8d31852e44 100644
--- a/lib/pace/Install.py
+++ b/lib/pace/Install.py
@@ -18,11 +18,11 @@ from install_helpers import fullpath, geturl, checkmd5sum, getfallback
 # settings
 
 thisdir = fullpath('.')
-version ='v.2023.10.04.pre'
+version ='v.2023.10.04'
 
 # known checksums for different PACE versions. used to validate the download.
 checksums = { \
-    'v.2023.10.04.pre': '61ba11a37ee00de8365b18b521d394a6'
+    'v.2023.10.04': '70ff79f4e59af175e55d24f3243ad1ff'
 }
 
 parser = ArgumentParser(prog='Install.py', description="LAMMPS library build wrapper script")
diff --git a/src/BOCS/fix_bocs.cpp b/src/BOCS/fix_bocs.cpp
index d17884855a..17bb1af002 100644
--- a/src/BOCS/fix_bocs.cpp
+++ b/src/BOCS/fix_bocs.cpp
@@ -1024,7 +1024,10 @@ void FixBocs::final_integrate()
 
   if (pstat_flag) {
     if (pstyle == ISO) pressure->compute_scalar();
-    else pressure->compute_vector();
+    else {
+      temperature->compute_vector();
+      pressure->compute_vector();
+    }
     couple();
     pressure->addstep(update->ntimestep+1);
   }
@@ -1961,6 +1964,7 @@ void FixBocs::nhc_press_integrate()
   int ich,i,pdof;
   double expfac,factor_etap,kecurrent;
   double kt = boltz * t_target;
+  double lkt_press;
 
   // Update masses, to preserve initial freq, if flag set
 
@@ -2006,7 +2010,8 @@ void FixBocs::nhc_press_integrate()
     }
   }
 
-  double lkt_press = pdof * kt;
+  if (pstyle == ISO) lkt_press = kt;
+  else lkt_press = pdof * kt;
   etap_dotdot[0] = (kecurrent - lkt_press)/etap_mass[0];
 
   double ncfac = 1.0/nc_pchain;
diff --git a/src/Depend.sh b/src/Depend.sh
index b88c527b55..dbffb2dba0 100755
--- a/src/Depend.sh
+++ b/src/Depend.sh
@@ -64,6 +64,7 @@ fi
 
 if (test $1 = "COLLOID") then
   depend GPU
+  depend KOKKOS
   depend OPENMP
 fi
 
diff --git a/src/INTEL/fix_intel.cpp b/src/INTEL/fix_intel.cpp
index 0a3d27a978..cb60149885 100644
--- a/src/INTEL/fix_intel.cpp
+++ b/src/INTEL/fix_intel.cpp
@@ -20,6 +20,7 @@
 #include "fix_intel.h"
 
 #include "comm.h"
+#include "domain.h"
 #include "error.h"
 #include "force.h"
 #include "neighbor.h"
@@ -470,6 +471,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
 
   int need_tag = 0;
   if (atom->molecular != Atom::ATOMIC || three_body_neighbor()) need_tag = 1;
+  if (domain->triclinic && force->newton_pair) need_tag = 1;
 
   // Clear buffers used for pair style
   char kmode[80];
diff --git a/src/INTEL/npair_halffull_newton_intel.cpp b/src/INTEL/npair_halffull_newton_intel.cpp
index cd05d5f97a..adcf2527ab 100644
--- a/src/INTEL/npair_halffull_newton_intel.cpp
+++ b/src/INTEL/npair_halffull_newton_intel.cpp
@@ -20,7 +20,9 @@
 
 #include "atom.h"
 #include "comm.h"
+#include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "modify.h"
 #include "my_page.h"
 #include "neigh_list.h"
@@ -56,6 +58,9 @@ void NPairHalffullNewtonIntel::build_t(NeighList *list,
   const int * _noalias const numneigh_full = list->listfull->numneigh;
   const int ** _noalias const firstneigh_full = (const int ** const)list->listfull->firstneigh;  // NOLINT
 
+  const double delta = 0.01 * force->angstrom;
+  const int triclinic = domain->triclinic;
+
   #if defined(_OPENMP)
   #pragma omp parallel
   #endif
@@ -82,25 +87,50 @@ void NPairHalffullNewtonIntel::build_t(NeighList *list,
       const int * _noalias const jlist = firstneigh_full[i];
       const int jnum = numneigh_full[i];
 
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma vector aligned
-      #pragma ivdep
-      #endif
-      for (int jj = 0; jj < jnum; jj++) {
-        const int joriginal = jlist[jj];
-        const int j = joriginal & NEIGHMASK;
-        int addme = 1;
-        if (j < nlocal) {
-          if (i > j) addme = 0;
-        } else {
-          if (x[j].z < ztmp) addme = 0;
-          if (x[j].z == ztmp) {
-            if (x[j].y < ytmp) addme = 0;
-            if (x[j].y == ytmp && x[j].x < xtmp) addme = 0;
+      if (!triclinic) {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          const int joriginal = jlist[jj];
+          const int j = joriginal & NEIGHMASK;
+          int addme = 1;
+          if (j < nlocal) {
+            if (i > j) addme = 0;
+          } else {
+            if (x[j].z < ztmp) addme = 0;
+            if (x[j].z == ztmp) {
+              if (x[j].y < ytmp) addme = 0;
+              if (x[j].y == ytmp && x[j].x < xtmp) addme = 0;
+            }
           }
+          if (addme)
+            neighptr[n++] = joriginal;
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          const int joriginal = jlist[jj];
+          const int j = joriginal & NEIGHMASK;
+          int addme = 1;
+          if (j < nlocal) {
+            if (i > j) addme = 0;
+          } else {
+            if (fabs(x[j].z-ztmp) > delta) {
+              if (x[j].z < ztmp) addme = 0;
+            } else if (fabs(x[j].y-ytmp) > delta) {
+              if (x[j].y < ytmp) addme = 0;
+            } else {
+              if (x[j].x < xtmp) addme = 0;
+            }
+          }
+          if (addme)
+            neighptr[n++] = joriginal;
         }
-        if (addme)
-          neighptr[n++] = joriginal;
       }
 
       ilist[ii] = i;
@@ -203,7 +233,7 @@ void NPairHalffullNewtonIntel::build_t3(NeighList *list, int *numhalf)
 
 void NPairHalffullNewtonIntel::build(NeighList *list)
 {
-  if (_fix->three_body_neighbor() == 0) {
+  if (_fix->three_body_neighbor() == 0 || domain->triclinic) {
     if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
       build_t(list, _fix->get_mixed_buffers());
     else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
diff --git a/src/INTEL/npair_halffull_newton_trim_intel.cpp b/src/INTEL/npair_halffull_newton_trim_intel.cpp
index e38375f750..34b9b20e9c 100644
--- a/src/INTEL/npair_halffull_newton_trim_intel.cpp
+++ b/src/INTEL/npair_halffull_newton_trim_intel.cpp
@@ -20,7 +20,9 @@
 
 #include "atom.h"
 #include "comm.h"
+#include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "modify.h"
 #include "my_page.h"
 #include "neigh_list.h"
@@ -57,6 +59,8 @@ void NPairHalffullNewtonTrimIntel::build_t(NeighList *list,
   const int ** _noalias const firstneigh_full = (const int ** const)list->listfull->firstneigh;  // NOLINT
 
   const flt_t cutsq_custom = cutoff_custom * cutoff_custom;
+  const double delta = 0.01 * force->angstrom;
+  const int triclinic = domain->triclinic;
 
   #if defined(_OPENMP)
   #pragma omp parallel
@@ -84,35 +88,70 @@ void NPairHalffullNewtonTrimIntel::build_t(NeighList *list,
       const int * _noalias const jlist = firstneigh_full[i];
       const int jnum = numneigh_full[i];
 
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma vector aligned
-      #pragma ivdep
-      #endif
-      for (int jj = 0; jj < jnum; jj++) {
-        const int joriginal = jlist[jj];
-        const int j = joriginal & NEIGHMASK;
-        int addme = 1;
-        if (j < nlocal) {
-          if (i > j) addme = 0;
-        } else {
-          if (x[j].z < ztmp) addme = 0;
-          if (x[j].z == ztmp) {
-            if (x[j].y < ytmp) addme = 0;
-            if (x[j].y == ytmp && x[j].x < xtmp) addme = 0;
+      if (!triclinic) {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          const int joriginal = jlist[jj];
+          const int j = joriginal & NEIGHMASK;
+          int addme = 1;
+          if (j < nlocal) {
+            if (i > j) addme = 0;
+          } else {
+            if (x[j].z < ztmp) addme = 0;
+            if (x[j].z == ztmp) {
+              if (x[j].y < ytmp) addme = 0;
+              if (x[j].y == ytmp && x[j].x < xtmp) addme = 0;
+            }
           }
+
+          // trim to shorter cutoff
+
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+
+          if (rsq > cutsq_custom) addme = 0;
+
+          if (addme)
+            neighptr[n++] = joriginal;
         }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          const int joriginal = jlist[jj];
+          const int j = joriginal & NEIGHMASK;
+          int addme = 1;
+          if (j < nlocal) {
+            if (i > j) addme = 0;
+          } else {
+            if (fabs(x[j].z-ztmp) > delta) {
+              if (x[j].z < ztmp) addme = 0;
+            } else if (fabs(x[j].y-ytmp) > delta) {
+              if (x[j].y < ytmp) addme = 0;
+            } else {
+              if (x[j].x < xtmp) addme = 0;
+            }
+          }
 
-        // trim to shorter cutoff
+          // trim to shorter cutoff
 
-        const flt_t delx = xtmp - x[j].x;
-        const flt_t dely = ytmp - x[j].y;
-        const flt_t delz = ztmp - x[j].z;
-        const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
 
-        if (rsq > cutsq_custom) addme = 0;
+          if (rsq > cutsq_custom) addme = 0;
 
-        if (addme)
-          neighptr[n++] = joriginal;
+          if (addme)
+            neighptr[n++] = joriginal;
+        }
       }
 
       ilist[ii] = i;
@@ -235,7 +274,7 @@ void NPairHalffullNewtonTrimIntel::build_t3(NeighList *list, int *numhalf,
 
 void NPairHalffullNewtonTrimIntel::build(NeighList *list)
 {
-  if (_fix->three_body_neighbor() == 0) {
+  if (_fix->three_body_neighbor() == 0 || domain->triclinic) {
     if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
       build_t(list, _fix->get_mixed_buffers());
     else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
diff --git a/src/INTEL/npair_intel.cpp b/src/INTEL/npair_intel.cpp
index 600109d7ae..dcfb66e05f 100644
--- a/src/INTEL/npair_intel.cpp
+++ b/src/INTEL/npair_intel.cpp
@@ -204,6 +204,8 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
   }
   const int special_bound = sb;
 
+  const double delta = 0.01 * force->angstrom;
+
   #ifdef _LMP_INTEL_OFFLOAD
   const int * _noalias const binhead = this->binhead;
   const int * _noalias const bins = this->bins;
@@ -229,7 +231,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
     in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
     in(offload_end,separate_buffers,astart,aend,nlocal,molecular) \
     in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
-    in(pack_width,special_bound)                                        \
+    in(pack_width,special_bound,delta)                                  \
     out(overflow:length(5) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(tag)
@@ -331,7 +333,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
         const flt_t ztmp = x[i].z;
         const int itype = x[i].w;
         tagint itag;
-        if (THREE) itag = tag[i];
+        if (THREE || (TRI && !FULL)) itag = tag[i];
         const int ioffset = ntypes * itype;
 
         const int ibin = atombin[i];
@@ -365,7 +367,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
             ty[u] = x[j].y;
             tz[u] = x[j].z;
             tjtype[u] = x[j].w;
-            if (THREE) ttag[u] = tag[j];
+            if (THREE || (TRI && !FULL)) ttag[u] = tag[j];
           }
 
           if (FULL == 0 && TRI != 1) {
@@ -486,12 +488,32 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
 
           // Triclinic
           if (TRI) {
-            if (tz[u] < ztmp) addme = 0;
-            if (tz[u] == ztmp) {
-              if (ty[u] < ytmp) addme = 0;
-              if (ty[u] == ytmp) {
-                if (tx[u] < xtmp) addme = 0;
-                if (tx[u] == xtmp && j <= i) addme = 0;
+            if (FULL) {
+              if (tz[u] < ztmp) addme = 0;
+              if (tz[u] == ztmp) {
+                if (ty[u] < ytmp) addme = 0;
+                if (ty[u] == ytmp) {
+                  if (tx[u] < xtmp) addme = 0;
+                  if (tx[u] == xtmp && j <= i) addme = 0;
+                }
+              }
+            } else {
+              if (j <= i) addme = 0;
+              if (j >= nlocal) {
+                const tagint jtag = ttag[u];
+                if (itag > jtag) {
+                  if ((itag+jtag) % 2 == 0) addme = 0;
+                } else if (itag < jtag) {
+                  if ((itag+jtag) % 2 == 1) addme = 0;
+                } else {
+                  if (fabs(tz[u]-ztmp) > delta) {
+                    if (tz[u] < ztmp) addme = 0;
+                  } else if (fabs(ty[u]-ytmp) > delta) {
+                    if (ty[u] < ytmp) addme = 0;
+                  } else {
+                    if (tx[u] < xtmp) addme = 0;
+                  }
+                }
               }
             }
           }
diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index d44ed1c981..489efc55a0 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -129,6 +129,8 @@ action fix_dt_reset_kokkos.cpp
 action fix_dt_reset_kokkos.h
 action fix_enforce2d_kokkos.cpp
 action fix_enforce2d_kokkos.h
+action fix_efield_kokkos.cpp
+action fix_efield_kokkos.h
 action fix_eos_table_rx_kokkos.cpp fix_eos_table_rx.cpp
 action fix_eos_table_rx_kokkos.h fix_eos_table_rx.h
 action fix_freeze_kokkos.cpp fix_freeze.cpp
@@ -173,6 +175,8 @@ action fix_shake_kokkos.cpp fix_shake.cpp
 action fix_shake_kokkos.h fix_shake.h
 action fix_shardlow_kokkos.cpp fix_shardlow.cpp
 action fix_shardlow_kokkos.h fix_shardlow.h
+action fix_spring_self_kokkos.cpp
+action fix_spring_self_kokkos.h
 action fix_viscous_kokkos.cpp
 action fix_viscous_kokkos.h
 action fix_wall_gran_kokkos.cpp fix_wall_gran.cpp
@@ -363,6 +367,8 @@ action pair_vashishta_kokkos.cpp pair_vashishta.cpp
 action pair_vashishta_kokkos.h pair_vashishta.h
 action pair_yukawa_kokkos.cpp
 action pair_yukawa_kokkos.h
+action pair_yukawa_colloid_kokkos.cpp pair_yukawa_colloid.cpp
+action pair_yukawa_colloid_kokkos.h pair_yukawa_colloid.h
 action pair_zbl_kokkos.cpp
 action pair_zbl_kokkos.h
 action pppm_kokkos.cpp pppm.cpp
diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp
index 03537e7b88..bc393b29d8 100644
--- a/src/KOKKOS/atom_kokkos.cpp
+++ b/src/KOKKOS/atom_kokkos.cpp
@@ -44,6 +44,9 @@ AtomKokkos::AtomKokkos(LAMMPS *lmp) : Atom(lmp)
 
   h_tag_min = Kokkos::subview(h_tag_min_max,0);
   h_tag_max = Kokkos::subview(h_tag_min_max,1);
+
+  nprop_atom = 0;
+  fix_prop_atom = nullptr;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -112,6 +115,7 @@ AtomKokkos::~AtomKokkos()
 
   memoryKK->destroy_kokkos(k_dvector, dvector);
   dvector = nullptr;
+  delete [] fix_prop_atom;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -125,11 +129,37 @@ void AtomKokkos::init()
 
 /* ---------------------------------------------------------------------- */
 
+void AtomKokkos::update_property_atom()  // rebuild the cached list of fix property/atom instances
+{
+  nprop_atom = 0;
+  std::vector<Fix *> prop_atom_fixes;
+  for (auto &ifix : modify->get_fix_by_style("^property/atom")) {
+    if (!ifix->kokkosable)  // every matching fix must be flagged kokkosable, otherwise abort
+      error->all(FLERR, "KOKKOS package requires a Kokkos-enabled version of fix property/atom");
+
+    ++nprop_atom;
+    prop_atom_fixes.push_back(ifix);
+  }
+
+  delete[] fix_prop_atom;  // drop the previous list (the constructor sets it to nullptr, so the first call is safe)
+  fix_prop_atom = new FixPropertyAtomKokkos *[nprop_atom];
+
+  int n = 0;
+  for (auto &ifix : prop_atom_fixes)
+    fix_prop_atom[n++] = dynamic_cast<FixPropertyAtomKokkos *>(ifix);  // NOTE(review): cast result is not checked for nullptr
+}
+
+/* ---------------------------------------------------------------------- */
+
 void AtomKokkos::sync(const ExecutionSpace space, unsigned int mask)
 {
-  if (space == Device && lmp->kokkos->auto_sync) avecKK->modified(Host, mask);
+  if (space == Device && lmp->kokkos->auto_sync) {  // auto_sync: flag host data as modified before syncing to the device
+    avecKK->modified(Host, mask);
+    for (int n = 0; n < nprop_atom; n++) fix_prop_atom[n]->modified(Host, mask);
+  }
 
   avecKK->sync(space, mask);
+  for (int n = 0; n < nprop_atom; n++) fix_prop_atom[n]->sync(space, mask);  // forward the same request to each cached fix property/atom
 }
 
 /* ---------------------------------------------------------------------- */
@@ -137,13 +167,20 @@ void AtomKokkos::sync(const ExecutionSpace space, unsigned int mask)
 void AtomKokkos::modified(const ExecutionSpace space, unsigned int mask)
 {
   avecKK->modified(space, mask);
+  for (int n = 0; n < nprop_atom; n++) fix_prop_atom[n]->modified(space, mask);  // forward the same request to each cached fix property/atom
 
-  if (space == Device && lmp->kokkos->auto_sync) avecKK->sync(Host, mask);
+  if (space == Device && lmp->kokkos->auto_sync) {  // auto_sync: immediately sync device-side changes back to the host
+    avecKK->sync(Host, mask);
+    for (int n = 0; n < nprop_atom; n++) fix_prop_atom[n]->sync(Host, mask);
+  }
 }
 
+/* ---------------------------------------------------------------------- */
+
 void AtomKokkos::sync_overlapping_device(const ExecutionSpace space, unsigned int mask)
 {
   avecKK->sync_overlapping_device(space, mask);
+  for (int n = 0; n < nprop_atom; n++) fix_prop_atom[n]->sync_overlapping_device(space, mask);  // forward the same request to each cached fix property/atom
 }
 /* ---------------------------------------------------------------------- */
 
@@ -375,7 +412,7 @@ AtomVec *AtomKokkos::new_avec(const std::string &style, int trysuffix, int &sfla
   int hybrid_substyle_flag = (avec != nullptr);
 
   AtomVec *avec = Atom::new_avec(style, trysuffix, sflag);
-  if (!avec->kokkosable) error->all(FLERR, "KOKKOS package requires a kokkos enabled atom_style");
+  if (!avec->kokkosable) error->all(FLERR, "KOKKOS package requires a Kokkos-enabled atom_style");
 
   if (!hybrid_substyle_flag)
     avecKK = dynamic_cast<AtomVecKokkos*>(avec);
diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h
index f8b00f21f2..21a9aeebbd 100644
--- a/src/KOKKOS/atom_kokkos.h
+++ b/src/KOKKOS/atom_kokkos.h
@@ -14,6 +14,7 @@
 
 #include "atom.h"               // IWYU pragma: export
 #include "kokkos_type.h"
+#include "fix_property_atom_kokkos.h"
 
 #include <Kokkos_Sort.hpp>
 
@@ -25,6 +26,8 @@ namespace LAMMPS_NS {
 class AtomKokkos : public Atom {
  public:
   bool sort_classic;
+  int nprop_atom;
+  FixPropertyAtomKokkos** fix_prop_atom;
 
   DAT::tdual_tagint_1d k_tag;
   DAT::tdual_int_1d k_type, k_mask;
@@ -144,6 +147,7 @@ class AtomKokkos : public Atom {
   }
 
   void init() override;
+  void update_property_atom();
   void allocate_type_arrays() override;
   void sync(const ExecutionSpace space, unsigned int mask);
   void modified(const ExecutionSpace space, unsigned int mask);
diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
index a8ce29f666..c3430b9f6e 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -963,7 +963,6 @@ void AtomVecDPDKokkos::sync(ExecutionSpace space, unsigned int mask)
     if (mask & UCG_MASK) atomKK->k_uCG.sync<LMPDeviceType>();
     if (mask & UCGNEW_MASK) atomKK->k_uCGnew.sync<LMPDeviceType>();
     if (mask & DUCHEM_MASK) atomKK->k_duChem.sync<LMPDeviceType>();
-    if (mask & DVECTOR_MASK) atomKK->k_dvector.sync<LMPDeviceType>();
   } else {
     if (mask & X_MASK) atomKK->k_x.sync<LMPHostType>();
     if (mask & V_MASK) atomKK->k_v.sync<LMPHostType>();
@@ -980,7 +979,6 @@ void AtomVecDPDKokkos::sync(ExecutionSpace space, unsigned int mask)
     if (mask & UCG_MASK) atomKK->k_uCG.sync<LMPHostType>();
     if (mask & UCGNEW_MASK) atomKK->k_uCGnew.sync<LMPHostType>();
     if (mask & DUCHEM_MASK) atomKK->k_duChem.sync<LMPHostType>();
-    if (mask & DVECTOR_MASK) atomKK->k_dvector.sync<LMPHostType>();
   }
 }
 
@@ -1019,8 +1017,6 @@ void AtomVecDPDKokkos::sync_overlapping_device(ExecutionSpace space, unsigned in
       perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCGnew,space);
     if ((mask & DUCHEM_MASK) && atomKK->k_duChem.need_sync<LMPDeviceType>())
       perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_duChem,space);
-    if ((mask & DVECTOR_MASK) && atomKK->k_dvector.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_float_2d>(atomKK->k_dvector,space);
   } else {
     if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPHostType>())
       perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
@@ -1052,8 +1048,6 @@ void AtomVecDPDKokkos::sync_overlapping_device(ExecutionSpace space, unsigned in
       perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCGnew,space);
     if ((mask & DUCHEM_MASK) && atomKK->k_duChem.need_sync<LMPHostType>())
       perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_duChem,space);
-    if ((mask & DVECTOR_MASK) && atomKK->k_dvector.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_float_2d>(atomKK->k_dvector,space);
   }
 }
 
@@ -1077,7 +1071,6 @@ void AtomVecDPDKokkos::modified(ExecutionSpace space, unsigned int mask)
     if (mask & UCG_MASK) atomKK->k_uCG.modify<LMPDeviceType>();
     if (mask & UCGNEW_MASK) atomKK->k_uCGnew.modify<LMPDeviceType>();
     if (mask & DUCHEM_MASK) atomKK->k_duChem.modify<LMPDeviceType>();
-    if (mask & DVECTOR_MASK) atomKK->k_dvector.modify<LMPDeviceType>();
   } else {
     if (mask & X_MASK) atomKK->k_x.modify<LMPHostType>();
     if (mask & V_MASK) atomKK->k_v.modify<LMPHostType>();
@@ -1094,6 +1087,5 @@ void AtomVecDPDKokkos::modified(ExecutionSpace space, unsigned int mask)
     if (mask & UCG_MASK) atomKK->k_uCG.modify<LMPHostType>();
     if (mask & UCGNEW_MASK) atomKK->k_uCGnew.modify<LMPHostType>();
     if (mask & DUCHEM_MASK) atomKK->k_duChem.modify<LMPHostType>();
-    if (mask & DVECTOR_MASK) atomKK->k_dvector.modify<LMPHostType>();
   }
 }
diff --git a/src/KOKKOS/atom_vec_kokkos.h b/src/KOKKOS/atom_vec_kokkos.h
index d3b2578b68..c10ff5b40a 100644
--- a/src/KOKKOS/atom_vec_kokkos.h
+++ b/src/KOKKOS/atom_vec_kokkos.h
@@ -139,6 +139,8 @@ class AtomVecKokkos : virtual public AtomVec {
 
   DAT::tdual_int_1d k_count;
 
+ public:
+
   #ifdef LMP_KOKKOS_GPU
   template<class ViewType>
   Kokkos::View<typename ViewType::data_type,
diff --git a/src/KOKKOS/fix_dt_reset_kokkos.cpp b/src/KOKKOS/fix_dt_reset_kokkos.cpp
index f3435e711e..4c7545cee0 100644
--- a/src/KOKKOS/fix_dt_reset_kokkos.cpp
+++ b/src/KOKKOS/fix_dt_reset_kokkos.cpp
@@ -113,7 +113,7 @@ void FixDtResetKokkos<DeviceType>::end_of_step()
    update->dt = dt;
    update->dt_default = 0;
    if (force->pair) force->pair->reset_dt();
-   for (int i = 0; i < modify->nfix; i++) modify->fix[i]->reset_dt();
+   for (auto &ifix : modify->get_fix_list()) ifix->reset_dt();
    output->reset_dt();
 
 }
diff --git a/src/KOKKOS/fix_efield_kokkos.cpp b/src/KOKKOS/fix_efield_kokkos.cpp
new file mode 100644
index 0000000000..ffe1c34e97
--- /dev/null
+++ b/src/KOKKOS/fix_efield_kokkos.cpp
@@ -0,0 +1,316 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Trung Nguyen (U Chicago)
+------------------------------------------------------------------------- */
+
+#include "fix_efield_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "update.h"
+#include "modify.h"
+#include "domain_kokkos.h"
+#include "region.h"
+#include "input.h"
+#include "variable.h"
+#include "memory_kokkos.h"
+#include "error.h"
+#include "atom_masks.h"
+#include "kokkos_base.h"
+
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+enum{NONE,CONSTANT,EQUAL,ATOM};
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixEfieldKokkos<DeviceType>::FixEfieldKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixEfield(lmp, narg, arg)  // all arguments are forwarded unchanged to the base class
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  memory->destroy(efield);  // release whatever efield holds after base-class construction ...
+  memoryKK->create_kokkos(k_efield,efield,maxatom,4,"efield:efield");  // ... and re-create it as a maxatom x 4 dual view
+  d_efield = k_efield.view<DeviceType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixEfieldKokkos<DeviceType>::~FixEfieldKokkos()
+{
+  if (copymode) return;  // no cleanup while copymode is set (post_force sets it around its kernels)
+
+  memoryKK->destroy_kokkos(k_efield,efield);
+  efield = nullptr;  // cleared after the dual view is destroyed; presumably so the base destructor does not free it again — confirm
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEfieldKokkos<DeviceType>::init()
+{
+  FixEfield::init();  // run the base-class setup first
+
+  if (utils::strmatch(update->integrate_style,"^respa"))  // respa integrators are rejected for this Kokkos variant
+    error->all(FLERR,"Cannot (yet) use respa with Kokkos");
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Add the force q*E to every selected atom and reduce -f.x(unwrapped) plus the force components into fsum.
+template<class DeviceType>
+void FixEfieldKokkos<DeviceType>::post_force(int /*vflag*/)
+{
+  atomKK->sync(execution_space, X_MASK | F_MASK | Q_MASK | IMAGE_MASK | MASK_MASK);
+
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  q = atomKK->k_q.view<DeviceType>();
+  image = atomKK->k_image.view<DeviceType>();
+  mask = atomKK->k_mask.view<DeviceType>();
+
+  int nlocal = atom->nlocal;
+
+  // update region if necessary
+
+  if (region) {
+    if (!utils::strmatch(region->style, "^block"))
+      error->all(FLERR,"Cannot (yet) use {}-style region with fix efield/kk",region->style);
+    region->prematch();
+    DAT::tdual_int_1d k_match = DAT::tdual_int_1d("efield:k_match",nlocal);
+    KokkosBase* regionKKBase = dynamic_cast<KokkosBase*>(region);  // NOTE(review): result is not checked for nullptr
+    regionKKBase->match_all_kokkos(groupbit,k_match);
+    k_match.template sync<DeviceType>();
+    d_match = k_match.template view<DeviceType>();
+  }
+
+  // reallocate efield array if necessary
+
+  if (varflag == ATOM && atom->nmax > maxatom) {
+    maxatom = atom->nmax;
+    memoryKK->destroy_kokkos(k_efield,efield);
+    memoryKK->create_kokkos(k_efield,efield,maxatom,4,"efield:efield");
+    d_efield = k_efield.view<DeviceType>();
+  }
+
+  fsum[0] = fsum[1] = fsum[2] = fsum[3] = 0.0;
+  double_4 fsum_kk;
+  force_flag = 0;
+  const bool l_has_region = (region != nullptr);  // d_match is only refreshed above when a region is set
+  auto l_d_match = d_match;                       // local copy so the lambdas can capture it by value
+
+  if (varflag == CONSTANT) {
+    copymode = 1;
+
+    // The TagFixEfieldConstant operator would be more concise, but it still has an issue with unwrap (TODO below)
+    {
+    // local variables for lambda capture
+    auto prd = Few<double,3>(domain->prd);
+    auto h = Few<double,6>(domain->h);
+    auto triclinic = domain->triclinic;
+    auto l_ex = ex;
+    auto l_ey = ey;
+    auto l_ez = ez;
+
+    auto l_x = x;
+    auto l_q = q;
+    auto l_f = f;
+    auto l_mask = mask;
+    auto l_image = image;
+    auto l_groupbit = groupbit;
+
+    Kokkos::parallel_reduce(nlocal, LAMMPS_LAMBDA(const int& i, double_4& fsum_kk) {
+      if ((l_mask[i] & l_groupbit) && (!l_has_region || l_d_match[i])) {  // group member and, if a region is set, flagged as matching it
+        Few<double,3> x_i;
+        x_i[0] = l_x(i,0);
+        x_i[1] = l_x(i,1);
+        x_i[2] = l_x(i,2);
+        auto unwrap = DomainKokkos::unmap(prd,h,triclinic,x_i,l_image(i));
+        auto qtmp = l_q(i);
+        auto fx = qtmp * l_ex;
+        auto fy = qtmp * l_ey;
+        auto fz = qtmp * l_ez;
+        l_f(i,0) += fx;
+        l_f(i,1) += fy;
+        l_f(i,2) += fz;
+        fsum_kk.d0 -= fx * unwrap[0] + fy * unwrap[1] + fz * unwrap[2];
+        fsum_kk.d1 += fx;
+        fsum_kk.d2 += fy;
+        fsum_kk.d3 += fz;
+      }
+    },fsum_kk);
+    }
+
+    copymode = 0;
+
+  // variable force, wrap with clear/add
+
+  } else {
+
+    atomKK->sync(Host,ALL_MASK); // this can be removed when variable class is ported to Kokkos
+
+    modify->clearstep_compute();
+
+    if (xstyle == EQUAL) ex = input->variable->compute_equal(xvar);
+    else if (xstyle == ATOM)
+      input->variable->compute_atom(xvar,igroup,&efield[0][0],4,0);
+    if (ystyle == EQUAL) ey = input->variable->compute_equal(yvar);
+    else if (ystyle == ATOM)
+      input->variable->compute_atom(yvar,igroup,&efield[0][1],4,0);
+    if (zstyle == EQUAL) ez = input->variable->compute_equal(zvar);
+    else if (zstyle == ATOM)
+      input->variable->compute_atom(zvar,igroup,&efield[0][2],4,0);
+
+    modify->addstep_compute(update->ntimestep + 1);
+
+    if (varflag == ATOM) {  // this can be removed when variable class is ported to Kokkos
+      k_efield.modify<LMPHostType>();
+      k_efield.sync<DeviceType>();
+    }
+
+    copymode = 1;
+    // The TagFixEfieldNonConstant operator would be more concise, but it still has an issue with unwrap (TODO below)
+    {
+    // local variables for lambda capture
+    auto prd = Few<double,3>(domain->prd);
+    auto h = Few<double,6>(domain->h);
+    auto triclinic = domain->triclinic;
+    auto l_ex = ex;
+    auto l_ey = ey;
+    auto l_ez = ez;
+    auto l_d_efield = d_efield;
+
+    auto l_x = x;
+    auto l_q = q;
+    auto l_f = f;
+    auto l_mask = mask;
+    auto l_image = image;
+    auto l_groupbit = groupbit;
+    auto l_xstyle = xstyle;
+    auto l_ystyle = ystyle;
+    auto l_zstyle = zstyle;
+
+    Kokkos::parallel_reduce(nlocal, LAMMPS_LAMBDA(const int& i, double_4& fsum_kk) {
+      if ((l_mask[i] & l_groupbit) && (!l_has_region || l_d_match[i])) {  // group member and, if a region is set, flagged as matching it
+        Few<double,3> x_i;
+        x_i[0] = l_x(i,0);
+        x_i[1] = l_x(i,1);
+        x_i[2] = l_x(i,2);
+        auto unwrap = DomainKokkos::unmap(prd,h,triclinic,x_i,l_image(i));
+        auto qtmp = l_q(i);
+        auto fx = qtmp * l_ex;
+        auto fy = qtmp * l_ey;
+        auto fz = qtmp * l_ez;
+        if (l_xstyle == ATOM) l_f(i,0) += qtmp * l_d_efield(i,0);
+        else if (l_xstyle) l_f(i,0) += fx;
+        if (l_ystyle == ATOM) l_f(i,1) += qtmp * l_d_efield(i,1);
+        else if (l_ystyle) l_f(i,1) += fy;
+        if (l_zstyle == ATOM) l_f(i,2) += qtmp * l_d_efield(i,2);
+        else if (l_zstyle) l_f(i,2) += fz;
+        fsum_kk.d0 -= fx * unwrap[0] + fy * unwrap[1] + fz * unwrap[2];
+        fsum_kk.d1 += fx;
+        fsum_kk.d2 += fy;
+        fsum_kk.d3 += fz;
+      }
+    },fsum_kk);
+    }
+
+    copymode = 0;
+  }
+
+  atomKK->modified(execution_space, F_MASK);
+
+  fsum[0] = fsum_kk.d0;
+  fsum[1] = fsum_kk.d1;
+  fsum[2] = fsum_kk.d2;
+  fsum[3] = fsum_kk.d3;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEfieldKokkos<DeviceType>::operator()(TagFixEfieldConstant, const int &i, double_4& fsum_kk) const {  // per-atom kernel body for a constant field; its tagged launch in post_force is commented out
+  if (mask[i] & groupbit) {
+    if (region && !d_match[i]) return;  // with a region set, skip atoms not flagged as matching it
+
+    auto prd = Few<double,3>(domain->prd);  // NOTE(review): domain is dereferenced inside the kernel here, whereas the post_force lambda copies prd/h beforehand — confirm this is safe on device
+    auto h = Few<double,6>(domain->h);
+    auto triclinic = domain->triclinic;
+    Few<double,3> x_i;
+    x_i[0] = x(i,0);
+    x_i[1] = x(i,1);
+    x_i[2] = x(i,2);
+    auto unwrap = DomainKokkos::unmap(prd,h,triclinic,x_i,image(i));
+    const F_FLOAT qtmp = q(i);
+    const F_FLOAT fx = qtmp * ex;
+    const F_FLOAT fy = qtmp * ey;
+    const F_FLOAT fz = qtmp * ez;
+    f(i,0) += fx;
+    f(i,1) += fy;
+    f(i,2) += fz;
+    // TODO: access to unwrap below crashes
+    fsum_kk.d0 -= fx * unwrap[0] + fy * unwrap[1] + fz * unwrap[2];  // d0 accumulates -f.x(unwrapped); d1..d3 accumulate the force components
+    fsum_kk.d1 += fx;
+    fsum_kk.d2 += fy;
+    fsum_kk.d3 += fz;
+  }
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEfieldKokkos<DeviceType>::operator()(TagFixEfieldNonConstant, const int &i, double_4& fsum_kk) const {  // per-atom kernel body for equal/atom-style field components; its tagged launch in post_force is commented out
+  auto prd = Few<double,3>(domain->prd);  // NOTE(review): domain is dereferenced inside the kernel here, whereas the post_force lambda copies prd/h beforehand — confirm this is safe on device
+  auto h = Few<double,6>(domain->h);
+  auto triclinic = domain->triclinic;
+  if (mask[i] & groupbit) {
+    if (region && !d_match[i]) return;  // with a region set, skip atoms not flagged as matching it
+    Few<double,3> x_i;
+    x_i[0] = x(i,0);
+    x_i[1] = x(i,1);
+    x_i[2] = x(i,2);
+    auto unwrap = DomainKokkos::unmap(prd,h,triclinic,x_i,image(i));
+    const F_FLOAT qtmp = q[i];
+    const F_FLOAT fx = qtmp * ex;
+    const F_FLOAT fy = qtmp * ey;
+    const F_FLOAT fz = qtmp * ez;
+    if (xstyle == ATOM) f(i,0) += qtmp * d_efield(i,0);  // charge times the per-atom field, matching the post_force lambda
+    else if (xstyle) f(i,0) += fx;
+    if (ystyle == ATOM) f(i,1) += qtmp * d_efield(i,1);
+    else if (ystyle) f(i,1) += fy;
+    if (zstyle == ATOM) f(i,2) += qtmp * d_efield(i,2);
+    else if (zstyle) f(i,2) += fz;
+    // TODO: access to unwrap below crashes
+    fsum_kk.d0 -= fx * unwrap[0] + fy * unwrap[1] + fz * unwrap[2];  // d0 accumulates -f.x(unwrapped); d1..d3 accumulate the force components
+    fsum_kk.d1 += fx;
+    fsum_kk.d2 += fy;
+    fsum_kk.d3 += fz;
+  }
+}
+
+namespace LAMMPS_NS {
+template class FixEfieldKokkos<LMPDeviceType>;  // explicit instantiation for the device type
+#ifdef LMP_KOKKOS_GPU
+template class FixEfieldKokkos<LMPHostType>;  // host type is instantiated separately only when LMP_KOKKOS_GPU is defined
+#endif
+}
+
diff --git a/src/KOKKOS/fix_efield_kokkos.h b/src/KOKKOS/fix_efield_kokkos.h
new file mode 100644
index 0000000000..d159473d1d
--- /dev/null
+++ b/src/KOKKOS/fix_efield_kokkos.h
@@ -0,0 +1,86 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+// clang-format off
+FixStyle(efield/kk,FixEfieldKokkos<LMPDeviceType>);
+FixStyle(efield/kk/device,FixEfieldKokkos<LMPDeviceType>);
+FixStyle(efield/kk/host,FixEfieldKokkos<LMPHostType>);
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_FIX_EFIELD_KOKKOS_H
+#define LMP_FIX_EFIELD_KOKKOS_H
+
+#include "fix_efield.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+struct e_double_4 {
+  // four doubles bundled so they can be accumulated together as one value
+  double d0, d1, d2, d3;
+  KOKKOS_INLINE_FUNCTION
+  e_double_4() : d0(0.0), d1(0.0), d2(0.0), d3(0.0) {}    // all components start at zero
+  KOKKOS_INLINE_FUNCTION
+  e_double_4& operator+=(const e_double_4 &other) {
+    // component-wise accumulation of other into this object
+    d0 += other.d0;
+    d1 += other.d1;
+    d2 += other.d2;
+    d3 += other.d3;
+    return *this;
+  }
+};
+using double_4 = e_double_4;
+
+struct TagFixEfieldConstant{};    // empty tag type selecting the operator() overload that takes TagFixEfieldConstant
+
+struct TagFixEfieldNonConstant{};    // empty tag type selecting the operator() overload that takes TagFixEfieldNonConstant
+
+template<class DeviceType>
+class FixEfieldKokkos : public FixEfield {    // Kokkos variant of FixEfield, parameterized on the device type
+ public:
+  typedef DeviceType device_type;
+  typedef double_4 value_type;    // reduction value type passed to the operator() overloads below
+  typedef ArrayTypes<DeviceType> AT;
+
+  FixEfieldKokkos(class LAMMPS *, int, char **);
+  ~FixEfieldKokkos() override;
+  void init() override;
+  void post_force(int) override;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixEfieldConstant, const int&, double_4&) const;    // per-atom body, TagFixEfieldConstant dispatch
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixEfieldNonConstant, const int&, double_4&) const;    // per-atom body, TagFixEfieldNonConstant dispatch
+
+ private:
+  DAT::tdual_ffloat_2d k_efield;    // dual view; presumably backs d_efield - confirm in the .cpp
+  typename AT::t_ffloat_2d_randomread d_efield;    // read by the NonConstant kernel for ATOM-style components
+  typename AT::t_int_1d d_match;    // per-atom flag tested by the NonConstant kernel when a region is set
+
+  typename AT::t_x_array_randomread x;    // atom data views used by the kernels:
+  typename AT::t_float_1d_randomread q;   //   positions, charges, forces,
+  typename AT::t_f_array f;               //   image flags and group masks
+  typename AT::t_imageint_1d_randomread image;
+  typename AT::t_int_1d_randomread mask;
+};
+
+}
+
+#endif
+#endif
+
diff --git a/src/KOKKOS/fix_property_atom_kokkos.cpp b/src/KOKKOS/fix_property_atom_kokkos.cpp
index 1de07b39dc..dcd943cac6 100644
--- a/src/KOKKOS/fix_property_atom_kokkos.cpp
+++ b/src/KOKKOS/fix_property_atom_kokkos.cpp
@@ -30,7 +30,46 @@ FixPropertyAtomKokkos::FixPropertyAtomKokkos(LAMMPS *lmp, int narg, char **arg)
   FixPropertyAtom(lmp, narg, arg)
 {
   atomKK = (AtomKokkos *) atom;
-  grow_arrays(atom->nmax);
+  kokkosable = 1;
+
+  dvector_flag = 0;
+  for (int nv = 0; nv < nvalue; nv++)
+    if (styles[nv] == DVEC) dvector_flag = 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixPropertyAtomKokkos::post_constructor()    // runs the AtomKokkos hook, then the base-class post_constructor
+{
+  atomKK->update_property_atom();    // NOTE(review): presumably lets AtomKokkos re-scan for property/atom fixes - confirm
+
+  FixPropertyAtom::post_constructor();    // base-class step is deliberately called after the update above
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixPropertyAtomKokkos::~FixPropertyAtomKokkos()
+{
+  // deallocate per-atom vectors in Atom class
+  // set ptrs to a null pointer, so they no longer exist for Atom class
+
+  for (int nv = 0; nv < nvalue; nv++) {    // only MOLECULE, CHARGE and RMASS styles are handled in this destructor
+    if (styles[nv] == MOLECULE) {
+      atom->molecule_flag = 0;    // clear the Atom flag before releasing the storage
+      memoryKK->destroy_kokkos(atomKK->k_molecule,atom->molecule);    // destroy_kokkos is given both the dual view and the raw pointer
+      atom->molecule = nullptr;
+    } else if (styles[nv] == CHARGE) {
+      atom->q_flag = 0;
+      memoryKK->destroy_kokkos(atomKK->k_q,atom->q);
+      atom->q = nullptr;
+    } else if (styles[nv] == RMASS) {
+      atom->rmass_flag = 0;
+      memoryKK->destroy_kokkos(atomKK->k_rmass,atom->rmass);
+      atom->rmass = nullptr;
+    }
+  }
+
+  atomKK->update_property_atom();    // same AtomKokkos hook that post_constructor() calls; NOTE(review): confirm it tolerates a fix that is being destroyed
 }
 
 /* ----------------------------------------------------------------------
@@ -44,17 +83,17 @@ void FixPropertyAtomKokkos::grow_arrays(int nmax)
 {
   for (int nv = 0; nv < nvalue; nv++) {
     if (styles[nv] == MOLECULE) {
-      memory->grow(atom->molecule,nmax,"atom:molecule");
-      size_t nbytes = (nmax-nmax_old) * sizeof(tagint);
-      memset(&atom->molecule[nmax_old],0,nbytes);
+      atomKK->sync(Device,MOLECULE_MASK);
+      memoryKK->grow_kokkos(atomKK->k_molecule,atom->molecule,nmax,"atom:molecule");
+      atomKK->modified(Device,MOLECULE_MASK);
     } else if (styles[nv] == CHARGE) {
-      memory->grow(atom->q,nmax,"atom:q");
-      size_t nbytes = (nmax-nmax_old) * sizeof(double);
-      memset(&atom->q[nmax_old],0,nbytes);
+      atomKK->sync(Device,Q_MASK);
+      memoryKK->grow_kokkos(atomKK->k_q,atom->q,nmax,"atom:q");
+      atomKK->modified(Device,Q_MASK);
     } else if (styles[nv] == RMASS) {
-      memory->grow(atom->rmass,nmax,"atom:rmass");
-      size_t nbytes = (nmax-nmax_old) * sizeof(double);
-      memset(&atom->rmass[nmax_old],0,nbytes);
+      atomKK->sync(Device,RMASS_MASK);
+      memoryKK->grow_kokkos(atomKK->k_rmass,atom->rmass,nmax,"atom:rmass");
+      atomKK->modified(Device,RMASS_MASK);
     } else if (styles[nv] == TEMPERATURE) {
       memory->grow(atom->temperature, nmax, "atom:temperature");
       size_t nbytes = (nmax - nmax_old) * sizeof(double);
@@ -69,7 +108,7 @@ void FixPropertyAtomKokkos::grow_arrays(int nmax)
       memset(&atom->ivector[index[nv]][nmax_old],0,nbytes);
     } else if (styles[nv] == DVEC) {
       atomKK->sync(Device,DVECTOR_MASK);
-      memoryKK->grow_kokkos(atomKK->k_dvector,atomKK->dvector,atomKK->k_dvector.extent(0),nmax,
+      memoryKK->grow_kokkos(atomKK->k_dvector,atom->dvector,atomKK->k_dvector.extent(0),nmax,
                           "atom:dvector");
       atomKK->modified(Device,DVECTOR_MASK);
     } else if (styles[nv] == IARRAY) {
@@ -84,3 +123,62 @@ void FixPropertyAtomKokkos::grow_arrays(int nmax)
   }
   nmax_old = nmax;
 }
+
+/* ---------------------------------------------------------------------- */
+
+void FixPropertyAtomKokkos::sync(ExecutionSpace space, unsigned int mask)
+{
+  // sync the property dual views selected by mask to the requested space;
+  // a view is only touched when its property flag is set
+  auto sync_view = [space](auto &k_view) {
+    if (space == Device) k_view.template sync<LMPDeviceType>();
+    else k_view.template sync<LMPHostType>();
+  };
+
+  if (molecule_flag && (mask & MOLECULE_MASK)) sync_view(atomKK->k_molecule);
+  if (q_flag && (mask & Q_MASK)) sync_view(atomKK->k_q);
+  if (rmass_flag && (mask & RMASS_MASK)) sync_view(atomKK->k_rmass);
+  if (dvector_flag && (mask & DVECTOR_MASK)) sync_view(atomKK->k_dvector);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixPropertyAtomKokkos::sync_overlapping_device(ExecutionSpace space, unsigned int mask)
+{    // for every view selected by mask that reports need_sync for the target space, call avecKK->perform_async_copy on it
+  if (space == Device) {    // NOTE(review): unlike sync()/modified(), the *_flag members are not checked here - confirm intended
+    if ((mask & MOLECULE_MASK) && atomKK->k_molecule.need_sync<LMPDeviceType>())
+      atomKK->avecKK->perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_molecule,space);
+    if ((mask & Q_MASK) && atomKK->k_q.need_sync<LMPDeviceType>())
+      atomKK->avecKK->perform_async_copy<DAT::tdual_float_1d>(atomKK->k_q,space);
+    if ((mask & RMASS_MASK) && atomKK->k_rmass.need_sync<LMPDeviceType>())
+      atomKK->avecKK->perform_async_copy<DAT::tdual_float_1d>(atomKK->k_rmass,space);
+    if ((mask & DVECTOR_MASK) && atomKK->k_dvector.need_sync<LMPDeviceType>())
+      atomKK->avecKK->perform_async_copy<DAT::tdual_float_2d>(atomKK->k_dvector,space);
+  } else {    // any other space value is treated as the host
+    if ((mask & MOLECULE_MASK) && atomKK->k_molecule.need_sync<LMPHostType>())
+      atomKK->avecKK->perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_molecule,space);
+    if ((mask & Q_MASK) && atomKK->k_q.need_sync<LMPHostType>())
+      atomKK->avecKK->perform_async_copy<DAT::tdual_float_1d>(atomKK->k_q,space);
+    if ((mask & RMASS_MASK) && atomKK->k_rmass.need_sync<LMPHostType>())
+      atomKK->avecKK->perform_async_copy<DAT::tdual_float_1d>(atomKK->k_rmass,space);
+    if ((mask & DVECTOR_MASK) && atomKK->k_dvector.need_sync<LMPHostType>())
+      atomKK->avecKK->perform_async_copy<DAT::tdual_float_2d>(atomKK->k_dvector,space);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixPropertyAtomKokkos::modified(ExecutionSpace space, unsigned int mask)
+{
+  // mark the property dual views selected by mask as modified in the given
+  // space; a view is only touched when its property flag is set
+  auto mark_view = [space](auto &k_view) {
+    if (space == Device) k_view.template modify<LMPDeviceType>();
+    else k_view.template modify<LMPHostType>();
+  };
+
+  if (molecule_flag && (mask & MOLECULE_MASK)) mark_view(atomKK->k_molecule);
+  if (q_flag && (mask & Q_MASK)) mark_view(atomKK->k_q);
+  if (rmass_flag && (mask & RMASS_MASK)) mark_view(atomKK->k_rmass);
+  if (dvector_flag && (mask & DVECTOR_MASK)) mark_view(atomKK->k_dvector);
+}
diff --git a/src/KOKKOS/fix_property_atom_kokkos.h b/src/KOKKOS/fix_property_atom_kokkos.h
index 90eddc98e0..adbe6ab20b 100644
--- a/src/KOKKOS/fix_property_atom_kokkos.h
+++ b/src/KOKKOS/fix_property_atom_kokkos.h
@@ -22,14 +22,23 @@ FixStyle(property/atom/kk,FixPropertyAtomKokkos);
 #define LMP_FIX_PROPERTY_ATOM_KOKKOS_H
 
 #include "fix_property_atom.h"
+#include "atom_vec_kokkos.h"
 
 namespace LAMMPS_NS {
 
 class FixPropertyAtomKokkos : public FixPropertyAtom {
  public:
   FixPropertyAtomKokkos(class LAMMPS *, int, char **);
-
+  void post_constructor() override;    // calls atomKK->update_property_atom(), then the base-class version
+  ~FixPropertyAtomKokkos() override;    // releases the molecule/q/rmass dual views owned through this fix
   void grow_arrays(int) override;
+
+  void sync(ExecutionSpace space, unsigned int mask);    // sync the flagged dual views selected by mask to space
+  void modified(ExecutionSpace space, unsigned int mask);    // mark the flagged dual views selected by mask as modified in space
+  void sync_overlapping_device(ExecutionSpace space, unsigned int mask);    // like sync, but goes through avecKK->perform_async_copy
+
+ private:
+  int dvector_flag;    // set to 1 by the constructor when any style is DVEC, else 0
 };
 
 }
diff --git a/src/KOKKOS/fix_spring_self_kokkos.cpp b/src/KOKKOS/fix_spring_self_kokkos.cpp
new file mode 100644
index 0000000000..efd8a652ff
--- /dev/null
+++ b/src/KOKKOS/fix_spring_self_kokkos.cpp
@@ -0,0 +1,332 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Trung Nguyen (U Chicago)
+------------------------------------------------------------------------- */
+
+#include "fix_spring_self_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "update.h"
+#include "modify.h"
+#include "domain_kokkos.h"
+#include "region.h"
+#include "input.h"
+#include "variable.h"
+#include "memory_kokkos.h"
+#include "error.h"
+#include "atom_masks.h"
+#include "kokkos_base.h"
+
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixSpringSelfKokkos<DeviceType>::FixSpringSelfKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixSpringSelf(lmp, narg, arg)    // base constructor is expected to have filled xoriginal (its rows are copied below)
+{
+  kokkosable = 1;
+  exchange_comm_device = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;    // empty masks: post_force() does its own sync/modified calls
+  datamask_modify = EMPTY_MASK;
+
+  xoriginal_tmp = xoriginal;    // keep the base-class array aside ...
+  xoriginal = nullptr;          // ... and clear the pointer before grow_arrays() re-creates it via grow_kokkos
+
+  int nmax = atom->nmax;
+  grow_arrays(nmax);
+
+  for (int i = 0; i < atom->nlocal; i++) {    // copy the saved rows of the local atoms into the host view
+    k_xoriginal.h_view(i,0) = xoriginal_tmp[i][0];
+    k_xoriginal.h_view(i,1) = xoriginal_tmp[i][1];
+    k_xoriginal.h_view(i,2) = xoriginal_tmp[i][2];
+  }
+
+  k_xoriginal.modify_host();    // host copy is now the current one
+
+  d_count = typename AT::t_int_scalar("spring/self:count");    // scalar written by the pack kernel (total buffer length)
+  h_count = Kokkos::create_mirror_view(d_count);    // mirror used to read d_count back in pack_exchange_kokkos()
+
+  memory->destroy(xoriginal_tmp);    // the temporary base-class array is no longer needed
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixSpringSelfKokkos<DeviceType>::~FixSpringSelfKokkos()
+{
+  if (copymode) return;    // copymode is set while copies of this object are handed to kernels; such copies must not free anything
+
+  memoryKK->destroy_kokkos(k_xoriginal,xoriginal);    // release the dual view and the raw pointer together
+  xoriginal = nullptr;    // NOTE(review): presumably prevents the base-class destructor from freeing it again - confirm
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixSpringSelfKokkos<DeviceType>::init()    // base-class init plus a check that rejects respa integrators
+{
+  FixSpringSelf::init();
+
+  if (utils::strmatch(update->integrate_style,"^respa"))    // integrate_style beginning with "respa" is an error here
+    error->all(FLERR,"Cannot (yet) use respa with Kokkos");
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixSpringSelfKokkos<DeviceType>::post_force(int /*vflag*/)    // adds -k*(unwrapped x - xoriginal) to f and sets espring
+{
+  atomKK->sync(execution_space, X_MASK | F_MASK | IMAGE_MASK | MASK_MASK);    // atom data the kernel reads or updates
+
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  image = atomKK->k_image.view<DeviceType>();
+  mask = atomKK->k_mask.view<DeviceType>();
+  int nlocal = atom->nlocal;
+
+  double espring_kk;    // receives the reduction result below
+
+  k_xoriginal.modify<LMPHostType>();    // NOTE(review): unconditionally flags the host copy as modified on every call;
+  k_xoriginal.sync<DeviceType>();       //   confirm this cannot conflict with or discard device-side changes made by the exchange kernels
+
+  copymode = 1;
+
+  {
+  // local variables for lambda capture
+  auto prd = Few<double,3>(domain->prd);    // domain data is copied here, outside the lambda, and captured by value
+  auto h = Few<double,6>(domain->h);
+  auto triclinic = domain->triclinic;
+  auto l_k = k;
+  auto l_xoriginal = d_xoriginal;
+
+  auto l_x = x;
+  auto l_f = f;
+  auto l_mask = mask;
+  auto l_image = image;
+  auto l_groupbit = groupbit;
+  auto l_xflag = xflag;
+  auto l_yflag = yflag;
+  auto l_zflag = zflag;
+
+  Kokkos::parallel_reduce(nlocal, LAMMPS_LAMBDA(const int& i, double& espring_kk) {    // lambda parameter shadows the outer espring_kk
+    if (l_mask[i] & l_groupbit) {    // only atoms in the fix group
+      Few<double,3> x_i;
+      x_i[0] = l_x(i,0);
+      x_i[1] = l_x(i,1);
+      x_i[2] = l_x(i,2);
+      auto unwrap = DomainKokkos::unmap(prd,h,triclinic,x_i,l_image(i));    // unmap() applied with this atom's image flags
+      auto dx = unwrap[0] - l_xoriginal(i, 0);    // displacement from the stored original position
+      auto dy = unwrap[1] - l_xoriginal(i, 1);
+      auto dz = unwrap[2] - l_xoriginal(i, 2);
+      if (!l_xflag) dx = 0.0;    // a cleared flag removes that component from force and energy
+      if (!l_yflag) dy = 0.0;
+      if (!l_zflag) dz = 0.0;
+      l_f(i,0) -= l_k*dx;
+      l_f(i,1) -= l_k*dy;
+      l_f(i,2) -= l_k*dz;
+      espring_kk += l_k * (dx*dx + dy*dy + dz*dz);    // k*r^2; the factor 0.5 is applied after the reduction
+    }
+  },espring_kk);
+  }
+
+  copymode = 0;
+
+  atomKK->modified(execution_space, F_MASK);    // only forces were changed
+
+  espring = 0.5*espring_kk;
+}
+
+/* ----------------------------------------------------------------------
+   allocate local atom-based arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixSpringSelfKokkos<DeviceType>::grow_arrays(int nmax)    // nmax: new first dimension passed to grow_kokkos
+{
+  memoryKK->grow_kokkos(k_xoriginal,xoriginal,nmax,"spring/self:xoriginal");    // grows the dual view and the xoriginal pointer together
+  d_xoriginal = k_xoriginal.view<DeviceType>();    // refresh the cached view after the dual view changed
+}
+
+/* ----------------------------------------------------------------------
+   copy values within local atom-based arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixSpringSelfKokkos<DeviceType>::copy_arrays(int i, int j, int delflag)    // wraps the base-class copy with host sync/modify bookkeeping
+{
+  k_xoriginal.sync_host();    // make the host copy current before the base class touches it (presumably via the xoriginal pointer)
+
+  FixSpringSelf::copy_arrays(i,j,delflag);
+
+  k_xoriginal.modify_host();    // record that the host copy changed
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixSpringSelfKokkos<DeviceType>::pack_exchange_item(const int &mysend, int &offset, const bool &final) const
+{
+  // scan body: every send item contributes 3 values (one xoriginal row).
+  // parallel_scan may also run this body in a non-final pass where offset is
+  // not yet the true prefix, so all writes are restricted to the final pass;
+  // otherwise the j->i copy below would overwrite row i before it is packed
+  if (final) {
+    const int i = d_exchange_sendlist(mysend);
+    int m = nsend + offset;
+    d_buf[mysend] = m;    // slot mysend stores where this item's data starts
+    for (int d = 0; d < 3; d++) d_buf[m++] = d_xoriginal(i,d);
+    if (mysend == nsend-1) d_count() = m;    // last item records the total length
+
+    const int j = d_copylist(mysend);    // row copied into the vacated row i, if > -1
+    if (j > -1)
+      for (int d = 0; d < 3; d++) d_xoriginal(i,d) = d_xoriginal(j,d);
+  }
+  offset += 3;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixSpringSelfKokkos<DeviceType>::pack_exchange_kokkos(
+   const int &nsend, DAT::tdual_xfloat_2d &k_buf,
+   DAT::tdual_int_1d k_exchange_sendlist, DAT::tdual_int_1d k_copylist,
+   ExecutionSpace space)
+{
+  // packs xoriginal rows of the nsend listed atoms into k_buf; returns the value the kernel stored in d_count
+  k_buf.sync<DeviceType>();
+  k_copylist.sync<DeviceType>();
+  k_exchange_sendlist.sync<DeviceType>();
+
+  d_buf = typename ArrayTypes<DeviceType>::t_xfloat_1d_um(    // flat 1d alias over the 2d buffer storage
+    k_buf.template view<DeviceType>().data(),
+    k_buf.extent(0)*k_buf.extent(1));
+  d_copylist = k_copylist.view<DeviceType>();
+  d_exchange_sendlist = k_exchange_sendlist.view<DeviceType>();
+  this->nsend = nsend;    // stored in the member so the kernel body can read it
+
+  // xoriginal must be current in DeviceType's space before the kernel reads it
+  k_xoriginal.template sync<DeviceType>();
+
+  Kokkos::deep_copy(d_count,0);    // stays 0 if the kernel never writes it (e.g. nsend == 0)
+
+  copymode = 1;    // the functor below holds a by-value copy of this object
+
+  FixSpringSelfKokkosPackExchangeFunctor<DeviceType> pack_exchange_functor(this);
+  Kokkos::parallel_scan(nsend,pack_exchange_functor);    // scan value is each item's offset into the buffer
+
+  copymode = 0;
+
+  k_buf.modify<DeviceType>();
+
+  if (space == Host) k_buf.sync<LMPHostType>();    // leave the buffer current in the space the caller asked for
+  else k_buf.sync<LMPDeviceType>();
+
+  k_xoriginal.template modify<DeviceType>();    // the kernel also rewrites xoriginal rows (copylist)
+
+  Kokkos::deep_copy(h_count,d_count);    // read the total length back through the mirror
+
+  return h_count();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixSpringSelfKokkos<DeviceType>::operator()(TagFixSpringSelfUnpackExchange, const int &i) const
+{
+  int index = d_indices(i);    // destination row for received item i; negative means skip
+
+  if (index > -1) {
+    int m = d_buf[i];    // slot i holds the position where this item's data starts
+    // coordinates are floating point: store them as-is (an integer cast would truncate them)
+    d_xoriginal(index,0) = d_buf[m++];
+    d_xoriginal(index,1) = d_buf[m++];
+    d_xoriginal(index,2) = d_buf[m++];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+void FixSpringSelfKokkos<DeviceType>::unpack_exchange_kokkos(    // unpacks nrecv received items from k_buf into xoriginal
+  DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d &k_indices, int nrecv,
+  ExecutionSpace /*space*/)
+{
+  k_buf.sync<DeviceType>();
+  k_indices.sync<DeviceType>();
+
+  d_buf = typename ArrayTypes<DeviceType>::t_xfloat_1d_um(    // flat 1d alias over the 2d buffer storage
+    k_buf.template view<DeviceType>().data(),
+    k_buf.extent(0)*k_buf.extent(1));
+  d_indices = k_indices.view<DeviceType>();    // per-item destination rows, read by the kernel
+
+  k_xoriginal.template sync<DeviceType>();
+
+  copymode = 1;    // *this is passed to parallel_for by value
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixSpringSelfUnpackExchange>(0,nrecv),*this);
+
+  copymode = 0;
+
+  k_xoriginal.template modify<DeviceType>();    // the kernel wrote xoriginal rows
+}
+
+/* ----------------------------------------------------------------------
+   pack values in local atom-based arrays for exchange with another proc
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixSpringSelfKokkos<DeviceType>::pack_exchange(int i, double *buf)    // host path: returns whatever the base-class pack_exchange returns
+{
+  k_xoriginal.sync_host();    // make the host copy current before the base class uses it
+
+  int m = FixSpringSelf::pack_exchange(i,buf);
+
+  k_xoriginal.modify_host();    // NOTE(review): flags the host copy as modified although packing presumably only reads it - confirm this is needed
+
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   unpack values in local atom-based arrays from exchange with another proc
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixSpringSelfKokkos<DeviceType>::unpack_exchange(int nlocal, double *buf)    // host path: returns whatever the base-class unpack_exchange returns
+{
+  k_xoriginal.sync_host();    // make the host copy current before the base class writes into it
+
+  int m = FixSpringSelf::unpack_exchange(nlocal,buf);
+
+  k_xoriginal.modify_host();    // record that the host copy changed
+
+  return m;
+}
+
+namespace LAMMPS_NS {
+template class FixSpringSelfKokkos<LMPDeviceType>;    // explicit instantiation for the device type
+#ifdef LMP_KOKKOS_GPU
+template class FixSpringSelfKokkos<LMPHostType>;    // host type instantiated only when LMP_KOKKOS_GPU is defined (presumably the two types coincide otherwise - confirm)
+#endif
+}
+
diff --git a/src/KOKKOS/fix_spring_self_kokkos.h b/src/KOKKOS/fix_spring_self_kokkos.h
new file mode 100644
index 0000000000..b23e92249b
--- /dev/null
+++ b/src/KOKKOS/fix_spring_self_kokkos.h
@@ -0,0 +1,108 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+// clang-format off
+FixStyle(spring/self/kk,FixSpringSelfKokkos<LMPDeviceType>);
+FixStyle(spring/self/kk/device,FixSpringSelfKokkos<LMPDeviceType>);
+FixStyle(spring/self/kk/host,FixSpringSelfKokkos<LMPHostType>);
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_FIX_SPRING_SELF_KOKKOS_H
+#define LMP_FIX_SPRING_SELF_KOKKOS_H
+
+#include "fix_spring_self.h"
+#include "kokkos_type.h"
+#include "kokkos_base.h"
+
+namespace LAMMPS_NS {
+
+struct TagFixSpringSelfUnpackExchange{};
+
+template<class DeviceType>
+class FixSpringSelfKokkos : public FixSpringSelf, public KokkosBase {    // Kokkos variant of FixSpringSelf, parameterized on the device type
+ public:
+  typedef DeviceType device_type;
+  typedef double value_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  FixSpringSelfKokkos(class LAMMPS *, int, char **);
+  ~FixSpringSelfKokkos() override;
+  void init() override;
+  void grow_arrays(int) override;
+  void copy_arrays(int, int, int) override;
+  void post_force(int) override;
+
+  KOKKOS_INLINE_FUNCTION
+  void pack_exchange_item(const int&, int &, const bool &) const;    // scan body; invoked through FixSpringSelfKokkosPackExchangeFunctor
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixSpringSelfUnpackExchange, const int&) const;    // per-item unpack body
+
+  int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
+                           DAT::tdual_int_1d k_sendlist,
+                           DAT::tdual_int_1d k_copylist,
+                           ExecutionSpace space) override;
+
+  void unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,
+                              DAT::tdual_int_1d &indices,int nrecv,
+                              ExecutionSpace space) override;
+
+
+  int pack_exchange(int, double *) override;    // non-Kokkos exchange entry points
+  int unpack_exchange(int, double *) override;
+
+ protected:
+  DAT::tdual_x_array k_xoriginal;    // dual view holding the per-atom original coordinates
+  typename AT::t_x_array d_xoriginal;    // view of k_xoriginal for DeviceType
+
+  typename AT::t_x_array_randomread x;    // atom data views
+  typename AT::t_f_array f;
+  typename AT::t_imageint_1d_randomread image;
+  typename AT::t_int_1d_randomread mask;
+
+  int nsend;    // item count of the pack call in progress
+
+  typename AT::t_int_2d d_sendlist;    // NOTE(review): no use visible in the accompanying .cpp - confirm it is needed
+  typename AT::t_xfloat_1d_um d_buf;    // flat 1d alias over the exchange buffer
+
+  typename AT::t_int_1d d_exchange_sendlist;    // atom index of each send item
+  typename AT::t_int_1d d_copylist;    // per send item: row copied into the vacated row, or negative for none
+  typename AT::t_int_1d d_indices;    // per received item: destination row, or negative to skip
+
+  typename AT::t_int_scalar d_count;    // total packed length, written by the pack kernel
+  HAT::t_int_scalar h_count;    // host-side scalar paired with d_count
+
+  double **xoriginal_tmp;    // original coords of atoms
+
+};
+
+template <class DeviceType>
+struct FixSpringSelfKokkosPackExchangeFunctor {
+  using device_type = DeviceType;
+  using value_type = int;    // type of the scan accumulator (offset)
+  FixSpringSelfKokkos<DeviceType> c;    // by-value copy of the fix whose pack_exchange_item is called
+  FixSpringSelfKokkosPackExchangeFunctor(FixSpringSelfKokkos<DeviceType> *fix_ptr) : c(*fix_ptr) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int &isend, int &offset, const bool &final) const {
+    c.pack_exchange_item(isend, offset, final);    // forward all scan arguments unchanged
+  }
+};
+
+}
+
+#endif
+#endif
+
diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 91ea6d37ac..84a8f59dd0 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -137,13 +137,13 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
 
       int set_flag = 0;
       char *str;
-      if ((str = getenv("SLURM_LOCALID"))) {
+      if ((str = getenv("SLURM_LOCALID"))) {    // extra parentheses: assignment used as condition on purpose
         int local_rank = atoi(str);
         device = local_rank % ngpus;
         if (device >= skip_gpu) device++;
         set_flag = 1;
       }
-      if ((str = getenv("MPT_LRANK"))) {
+      if ((str = getenv("FLUX_TASK_LOCAL_ID"))) {    // extra parentheses: assignment used as condition on purpose
         if (ngpus > 0) {
           int local_rank = atoi(str);
           device = local_rank % ngpus;
@@ -151,7 +151,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
           set_flag = 1;
         }
       }
-      if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {
+      if ((str = getenv("MPT_LRANK"))) {    // extra parentheses: assignment used as condition on purpose
         if (ngpus > 0) {
           int local_rank = atoi(str);
           device = local_rank % ngpus;
@@ -159,7 +159,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
           set_flag = 1;
         }
       }
-      if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {
+      if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {    // extra parentheses: assignment used as condition on purpose
         if (ngpus > 0) {
           int local_rank = atoi(str);
           device = local_rank % ngpus;
@@ -167,7 +167,15 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
           set_flag = 1;
         }
       }
-      if ((str = getenv("PMI_LOCAL_RANK"))) {
+      if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {    // extra parentheses: assignment used as condition on purpose
+        if (ngpus > 0) {
+          int local_rank = atoi(str);
+          device = local_rank % ngpus;
+          if (device >= skip_gpu) device++;
+          set_flag = 1;
+        }
+      }
+      if ((str = getenv("PMI_LOCAL_RANK"))) {    // extra parentheses: assignment used as condition on purpose
         if (ngpus > 0) {
           int local_rank = atoi(str);
           device = local_rank % ngpus;
diff --git a/src/KOKKOS/kokkos_base.h b/src/KOKKOS/kokkos_base.h
index 7d9ecb5d80..1e22a38657 100644
--- a/src/KOKKOS/kokkos_base.h
+++ b/src/KOKKOS/kokkos_base.h
@@ -41,11 +41,6 @@ class KokkosBase {
                                            int, int *) {return 0;};
   virtual void unpack_forward_comm_fix_kokkos(int, int, DAT::tdual_xfloat_1d &) {}
 
-
-  // Region
-  virtual void match_all_kokkos(int, DAT::tdual_int_1d) {}
-
-  // Fix
   virtual int pack_exchange_kokkos(const int & /*nsend*/, DAT::tdual_xfloat_2d & /*k_buf*/,
                                    DAT::tdual_int_1d /*k_sendlist*/,
                                    DAT::tdual_int_1d /*k_copylist*/,
@@ -54,6 +49,9 @@ class KokkosBase {
                                       DAT::tdual_int_1d & /*indices*/, int /*nrecv*/,
                                       ExecutionSpace /*space*/) {}
 
+  // Region
+  virtual void match_all_kokkos(int, DAT::tdual_int_1d) {}
+
   using KeyViewType = DAT::t_x_array;
   using BinOp = BinOp3DLAMMPS<KeyViewType>;
   virtual void
diff --git a/src/KOKKOS/modify_kokkos.cpp b/src/KOKKOS/modify_kokkos.cpp
index 0b81a1cabb..8d8ffca671 100644
--- a/src/KOKKOS/modify_kokkos.cpp
+++ b/src/KOKKOS/modify_kokkos.cpp
@@ -362,6 +362,17 @@ void ModifyKokkos::pre_reverse(int eflag, int vflag)
 
 void ModifyKokkos::post_force(int vflag)
 {
+  for (int i = 0; i < n_post_force_group; i++) {
+    atomKK->sync(fix[list_post_force_group[i]]->execution_space,
+                 fix[list_post_force_group[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
+    if (!fix[list_post_force_group[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
+    fix[list_post_force_group[i]]->post_force(vflag);
+    lmp->kokkos->auto_sync = prev_auto_sync;
+    atomKK->modified(fix[list_post_force_group[i]]->execution_space,
+                     fix[list_post_force_group[i]]->datamask_modify);
+  }
+
   for (int i = 0; i < n_post_force; i++) {
     atomKK->sync(fix[list_post_force[i]]->execution_space,
                  fix[list_post_force[i]]->datamask_read);
diff --git a/src/KOKKOS/neigh_bond_kokkos.cpp b/src/KOKKOS/neigh_bond_kokkos.cpp
index 4cfe440b1f..b749590779 100644
--- a/src/KOKKOS/neigh_bond_kokkos.cpp
+++ b/src/KOKKOS/neigh_bond_kokkos.cpp
@@ -112,9 +112,8 @@ void NeighBondKokkos<DeviceType>::init_topology_kk() {
   int i,m;
   int bond_off = 0;
   int angle_off = 0;
-  for (i = 0; i < modify->nfix; i++)
-    if ((strcmp(modify->fix[i]->style,"shake") == 0)
-        || (strcmp(modify->fix[i]->style,"rattle") == 0))
+  for (const auto &ifix : modify->get_fix_list())
+    if (utils::strmatch(ifix->style,"^shake") || utils::strmatch(ifix->style,"^rattle"))
       bond_off = angle_off = 1;
   if (force->bond && force->bond_match("quartic")) bond_off = 1;
 
diff --git a/src/KOKKOS/neighbor_kokkos.cpp b/src/KOKKOS/neighbor_kokkos.cpp
index 0b40bce841..efb1247560 100644
--- a/src/KOKKOS/neighbor_kokkos.cpp
+++ b/src/KOKKOS/neighbor_kokkos.cpp
@@ -308,7 +308,8 @@ void NeighborKokkos::build_kokkos(int topoflag)
   for (i = 0; i < npair_perpetual; i++) {
     m = plist[i];
     if (!lists[m]->kokkos) atomKK->sync(Host,ALL_MASK);
-    if (!lists[m]->copy) lists[m]->grow(nlocal,nall);
+    if (!lists[m]->copy || lists[m]->trim || lists[m]->kk2cpu)
+      lists[m]->grow(nlocal,nall);
     neigh_pair[m]->build_setup();
     neigh_pair[m]->build(lists[m]);
   }
diff --git a/src/KOKKOS/npair_halffull_kokkos.cpp b/src/KOKKOS/npair_halffull_kokkos.cpp
index ec17cec844..c8c4d57fc9 100644
--- a/src/KOKKOS/npair_halffull_kokkos.cpp
+++ b/src/KOKKOS/npair_halffull_kokkos.cpp
@@ -18,6 +18,7 @@
 #include "atom_masks.h"
 #include "atom_vec.h"
 #include "domain.h"
+#include "force.h"
 #include "neigh_list_kokkos.h"
 
 #include <cmath>
@@ -26,8 +27,8 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType, int NEWTON, int TRIM>
-NPairHalffullKokkos<DeviceType,NEWTON,TRIM>::NPairHalffullKokkos(LAMMPS *lmp) : NPair(lmp) {
+template<class DeviceType, int NEWTON, int TRI, int TRIM>
+NPairHalffullKokkos<DeviceType,NEWTON,TRI,TRIM>::NPairHalffullKokkos(LAMMPS *lmp) : NPair(lmp) {
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
 }
@@ -41,13 +42,14 @@ NPairHalffullKokkos<DeviceType,NEWTON,TRIM>::NPairHalffullKokkos(LAMMPS *lmp) :
    if ghost, also store neighbors of ghost atoms & set inum,gnum correctly
 ------------------------------------------------------------------------- */
 
-template<class DeviceType, int NEWTON, int TRIM>
-void NPairHalffullKokkos<DeviceType,NEWTON,TRIM>::build(NeighList *list)
+template<class DeviceType, int NEWTON, int TRI, int TRIM>
+void NPairHalffullKokkos<DeviceType,NEWTON,TRI,TRIM>::build(NeighList *list)
 {
   if (NEWTON || TRIM) {
     x = atomKK->k_x.view<DeviceType>();
     atomKK->sync(execution_space,X_MASK);
   }
+
   nlocal = atom->nlocal;
 
   cutsq_custom = cutoff_custom*cutoff_custom;
@@ -66,6 +68,8 @@ void NPairHalffullKokkos<DeviceType,NEWTON,TRIM>::build(NeighList *list)
   d_numneigh = k_list->d_numneigh;
   d_neighbors = k_list->d_neighbors;
 
+  delta = 0.01 * force->angstrom;
+
   // loop over parent full list
 
   copymode = 1;
@@ -78,9 +82,9 @@ void NPairHalffullKokkos<DeviceType,NEWTON,TRIM>::build(NeighList *list)
   k_list->k_ilist.template modify<DeviceType>();
 }
 
-template<class DeviceType, int NEWTON, int TRIM>
+template<class DeviceType, int NEWTON, int TRI, int TRIM>
 KOKKOS_INLINE_FUNCTION
-void NPairHalffullKokkos<DeviceType,NEWTON,TRIM>::operator()(TagNPairHalffullCompute, const int &ii) const {
+void NPairHalffullKokkos<DeviceType,NEWTON,TRI,TRIM>::operator()(TagNPairHalffullCompute, const int &ii) const {
   int n = 0;
 
   const int i = d_ilist_full(ii);
@@ -92,6 +96,11 @@ void NPairHalffullKokkos<DeviceType,NEWTON,TRIM>::operator()(TagNPairHalffullCom
   }
 
   // loop over full neighbor list
+  // use i < j < nlocal to eliminate half the local/local interactions
+  // for triclinic, must use delta to eliminate half the local/ghost interactions
+  // cannot use I/J exact coord comparison as for orthog
+  //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+  //   with an added PBC offset can shift all 3 coords by epsilon
 
   const int jnum = d_numneigh_full(i);
   const AtomNeighbors neighbors_i = AtomNeighbors(&d_neighbors(i,0),d_numneigh(i),
@@ -103,6 +112,14 @@ void NPairHalffullKokkos<DeviceType,NEWTON,TRIM>::operator()(TagNPairHalffullCom
     if (NEWTON) {
       if (j < nlocal) {
         if (i > j) continue;
+      } else if (TRI) {
+        if (fabs(x(j,2)-ztmp) > delta) {
+          if (x(j,2) < ztmp) continue;
+        } else if (fabs(x(j,1)-ytmp) > delta) {
+          if (x(j,1) < ytmp) continue;
+        } else {
+          if (x(j,0) < xtmp) continue;
+        }
       } else {
         if (x(j,2) < ztmp) continue;
         if (x(j,2) == ztmp) {
@@ -141,14 +158,18 @@ void NPairHalffullKokkos<DeviceType,NEWTON,TRIM>::operator()(TagNPairHalffullCom
 }
 
 namespace LAMMPS_NS {
-template class NPairHalffullKokkos<LMPDeviceType,0,0>;
-template class NPairHalffullKokkos<LMPDeviceType,0,1>;
-template class NPairHalffullKokkos<LMPDeviceType,1,0>;
-template class NPairHalffullKokkos<LMPDeviceType,1,1>;
+template class NPairHalffullKokkos<LMPDeviceType,0,0,0>;
+template class NPairHalffullKokkos<LMPDeviceType,0,0,1>;
+template class NPairHalffullKokkos<LMPDeviceType,1,0,0>;
+template class NPairHalffullKokkos<LMPDeviceType,1,0,1>;
+template class NPairHalffullKokkos<LMPDeviceType,1,1,0>;
+template class NPairHalffullKokkos<LMPDeviceType,1,1,1>;
 #ifdef LMP_KOKKOS_GPU
-template class NPairHalffullKokkos<LMPHostType,0,0>;
-template class NPairHalffullKokkos<LMPHostType,0,1>;
-template class NPairHalffullKokkos<LMPHostType,1,0>;
-template class NPairHalffullKokkos<LMPHostType,1,1>;
+template class NPairHalffullKokkos<LMPHostType,0,0,0>;
+template class NPairHalffullKokkos<LMPHostType,0,0,1>;
+template class NPairHalffullKokkos<LMPHostType,1,0,0>;
+template class NPairHalffullKokkos<LMPHostType,1,0,1>;
+template class NPairHalffullKokkos<LMPHostType,1,1,0>;
+template class NPairHalffullKokkos<LMPHostType,1,1,1>;
 #endif
 }
diff --git a/src/KOKKOS/npair_halffull_kokkos.h b/src/KOKKOS/npair_halffull_kokkos.h
index c5a09f0b62..98526c7fee 100644
--- a/src/KOKKOS/npair_halffull_kokkos.h
+++ b/src/KOKKOS/npair_halffull_kokkos.h
@@ -16,53 +16,79 @@
 
 // Trim off
 
-// Newton
+// Newton, no triclinic
 
-typedef NPairHalffullKokkos<LMPDeviceType,1,0> NPairKokkosHalffullNewtonDevice;
+typedef NPairHalffullKokkos<LMPDeviceType,1,0,0> NPairKokkosHalffullNewtonDevice;
 NPairStyle(halffull/newton/kk/device,
            NPairKokkosHalffullNewtonDevice,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
-           NP_ORTHO | NP_TRI | NP_KOKKOS_DEVICE);
+           NP_ORTHO | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,1,0> NPairKokkosHalffullNewtonHost;
+typedef NPairHalffullKokkos<LMPHostType,1,0,0> NPairKokkosHalffullNewtonHost;
 NPairStyle(halffull/newton/kk/host,
            NPairKokkosHalffullNewtonHost,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
-           NP_ORTHO | NP_TRI | NP_KOKKOS_HOST);
+           NP_ORTHO | NP_KOKKOS_HOST);
 
-typedef NPairHalffullKokkos<LMPDeviceType,1,0> NPairKokkosHalffullNewtonDevice;
+typedef NPairHalffullKokkos<LMPDeviceType,1,0,0> NPairKokkosHalffullNewtonDevice;
 NPairStyle(halffull/newton/skip/kk/device,
            NPairKokkosHalffullNewtonDevice,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
-           NP_ORTHO | NP_TRI | NP_SKIP | NP_KOKKOS_DEVICE);
+           NP_ORTHO | NP_SKIP | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,1,0> NPairKokkosHalffullNewtonHost;
+typedef NPairHalffullKokkos<LMPHostType,1,0,0> NPairKokkosHalffullNewtonHost;
 NPairStyle(halffull/newton/skip/kk/host,
            NPairKokkosHalffullNewtonHost,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_SKIP | NP_KOKKOS_HOST);
+
+// Newton, triclinic
+
+typedef NPairHalffullKokkos<LMPDeviceType,1,1,0> NPairKokkosHalffullNewtonTriDevice;
+NPairStyle(halffull/newton/tri/kk/device,
+           NPairKokkosHalffullNewtonTriDevice,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI | NP_KOKKOS_DEVICE);
+
+typedef NPairHalffullKokkos<LMPHostType,1,1,0> NPairKokkosHalffullNewtonTriHost;
+NPairStyle(halffull/newton/tri/kk/host,
+           NPairKokkosHalffullNewtonTriHost,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI | NP_KOKKOS_HOST);
+
+typedef NPairHalffullKokkos<LMPDeviceType,1,1,0> NPairKokkosHalffullNewtonTriDevice;
+NPairStyle(halffull/newton/tri/skip/kk/device,
+           NPairKokkosHalffullNewtonTriDevice,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI | NP_SKIP | NP_KOKKOS_DEVICE);
+
+typedef NPairHalffullKokkos<LMPHostType,1,1,0> NPairKokkosHalffullNewtonTriHost;
+NPairStyle(halffull/newton/tri/skip/kk/host,
+           NPairKokkosHalffullNewtonTriHost,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_SKIP | NP_KOKKOS_HOST);
 
-// Newtoff
+// Newtoff (can be triclinic but template param always set to 0)
 
-typedef NPairHalffullKokkos<LMPDeviceType,0,0> NPairKokkosHalffullNewtoffDevice;
+typedef NPairHalffullKokkos<LMPDeviceType,0,0,0> NPairKokkosHalffullNewtoffDevice;
 NPairStyle(halffull/newtoff/kk/device,
            NPairKokkosHalffullNewtoffDevice,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,0,0> NPairKokkosHalffullNewtoffHost;
+typedef NPairHalffullKokkos<LMPHostType,0,0,0> NPairKokkosHalffullNewtoffHost;
 NPairStyle(halffull/newtoff/kk/host,
            NPairKokkosHalffullNewtoffHost,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_KOKKOS_HOST);
 
-typedef NPairHalffullKokkos<LMPDeviceType,0,0> NPairKokkosHalffullNewtoffDevice;
+typedef NPairHalffullKokkos<LMPDeviceType,0,0,0> NPairKokkosHalffullNewtoffDevice;
 NPairStyle(halffull/newtoff/skip/kk/device,
            NPairKokkosHalffullNewtoffDevice,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_SKIP | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,0,0> NPairKokkosHalffullNewtoffHost;
+typedef NPairHalffullKokkos<LMPHostType,0,0,0> NPairKokkosHalffullNewtoffHost;
 NPairStyle(halffull/newtoff/skip/kk/host,
            NPairKokkosHalffullNewtoffHost,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
@@ -70,166 +96,244 @@ NPairStyle(halffull/newtoff/skip/kk/host,
 
 //************ Ghost **************
 
-// Newton
+// Newton, no triclinic
 
-typedef NPairHalffullKokkos<LMPDeviceType,1,0> NPairKokkosHalffullNewtonGhostDevice;
+typedef NPairHalffullKokkos<LMPDeviceType,1,0,0> NPairKokkosHalffullNewtonDevice;
 NPairStyle(halffull/newton/ghost/kk/device,
-           NPairKokkosHalffullNewtonGhostDevice,
+           NPairKokkosHalffullNewtonDevice,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
-           NP_ORTHO | NP_TRI | NP_GHOST | NP_KOKKOS_DEVICE);
+           NP_ORTHO | NP_GHOST | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,1,0> NPairKokkosHalffullNewtonHost;
+typedef NPairHalffullKokkos<LMPHostType,1,0,0> NPairKokkosHalffullNewtonHost;
 NPairStyle(halffull/newton/ghost/kk/host,
            NPairKokkosHalffullNewtonHost,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
-           NP_ORTHO | NP_TRI | NP_GHOST | NP_KOKKOS_HOST);
+           NP_ORTHO | NP_GHOST | NP_KOKKOS_HOST);
 
-typedef NPairHalffullKokkos<LMPDeviceType,1,0> NPairKokkosHalffullNewtonGhostDevice;
+typedef NPairHalffullKokkos<LMPDeviceType,1,0,0> NPairKokkosHalffullNewtonDevice;
 NPairStyle(halffull/newton/skip/ghost/kk/device,
-           NPairKokkosHalffullNewtonGhostDevice,
+           NPairKokkosHalffullNewtonDevice,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
-           NP_ORTHO | NP_TRI | NP_GHOST | NP_SKIP | NP_KOKKOS_DEVICE);
+           NP_ORTHO | NP_GHOST | NP_SKIP | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,1,0> NPairKokkosHalffullNewtonHost;
+typedef NPairHalffullKokkos<LMPHostType,1,0,0> NPairKokkosHalffullNewtonHost;
 NPairStyle(halffull/newton/skip/ghost/kk/host,
            NPairKokkosHalffullNewtonHost,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_GHOST | NP_SKIP | NP_KOKKOS_HOST);
+
+// Newton, triclinic
+
+typedef NPairHalffullKokkos<LMPDeviceType,1,1,0> NPairKokkosHalffullNewtonTriDevice;
+NPairStyle(halffull/newton/tri/ghost/kk/device,
+           NPairKokkosHalffullNewtonTriDevice,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI | NP_GHOST | NP_KOKKOS_DEVICE);
+
+typedef NPairHalffullKokkos<LMPHostType,1,1,0> NPairKokkosHalffullNewtonTriHost;
+NPairStyle(halffull/newton/tri/ghost/kk/host,
+           NPairKokkosHalffullNewtonTriHost,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI | NP_GHOST | NP_KOKKOS_HOST);
+
+typedef NPairHalffullKokkos<LMPDeviceType,1,1,0> NPairKokkosHalffullNewtonTriDevice;
+NPairStyle(halffull/newton/tri/skip/ghost/kk/device,
+           NPairKokkosHalffullNewtonTriDevice,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI | NP_GHOST | NP_SKIP | NP_KOKKOS_DEVICE);
+
+typedef NPairHalffullKokkos<LMPHostType,1,1,0> NPairKokkosHalffullNewtonTriHost;
+NPairStyle(halffull/newton/tri/skip/ghost/kk/host,
+           NPairKokkosHalffullNewtonTriHost,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_SKIP | NP_KOKKOS_HOST);
 
-// Newtoff
+// Newtoff (can be triclinic but template param always set to 0)
 
-typedef NPairHalffullKokkos<LMPDeviceType,0,0> NPairKokkosHalffullNewtoffGhostDevice;
+typedef NPairHalffullKokkos<LMPDeviceType,0,0,0> NPairKokkosHalffullNewtoffDevice;
 NPairStyle(halffull/newtoff/ghost/kk/device,
-           NPairKokkosHalffullNewtoffGhostDevice,
+           NPairKokkosHalffullNewtoffDevice,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,0,0> NPairKokkosHalffullNewtoffHost;
+typedef NPairHalffullKokkos<LMPHostType,0,0,0> NPairKokkosHalffullNewtoffHost;
 NPairStyle(halffull/newtoff/ghost/kk/host,
            NPairKokkosHalffullNewtoffHost,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_KOKKOS_HOST);
 
-typedef NPairHalffullKokkos<LMPDeviceType,0,0> NPairKokkosHalffullNewtoffGhostDevice;
+typedef NPairHalffullKokkos<LMPDeviceType,0,0,0> NPairKokkosHalffullNewtoffDevice;
 NPairStyle(halffull/newtoff/skip/ghost/kk/device,
-           NPairKokkosHalffullNewtoffGhostDevice,
+           NPairKokkosHalffullNewtoffDevice,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_SKIP | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,0,0> NPairKokkosHalffullNewtoffHost;
+typedef NPairHalffullKokkos<LMPHostType,0,0,0> NPairKokkosHalffullNewtoffHost;
 NPairStyle(halffull/newtoff/skip/ghost/kk/host,
            NPairKokkosHalffullNewtoffHost,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_SKIP | NP_KOKKOS_HOST);
 
-
 //************ Trim **************
 
-// Newton
+// Newton, no triclinic
 
-typedef NPairHalffullKokkos<LMPDeviceType,1,1> NPairKokkosHalffullNewtonTrimDevice;
+typedef NPairHalffullKokkos<LMPDeviceType,1,0,1> NPairKokkosHalffullNewtonTrimDevice;
 NPairStyle(halffull/newton/trim/kk/device,
            NPairKokkosHalffullNewtonTrimDevice,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
-           NP_ORTHO | NP_TRI | NP_TRIM | NP_KOKKOS_DEVICE);
+           NP_ORTHO | NP_TRIM | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,1,1> NPairKokkosHalffullNewtonTrimHost;
+typedef NPairHalffullKokkos<LMPHostType,1,0,1> NPairKokkosHalffullNewtonTrimHost;
 NPairStyle(halffull/newton/trim/kk/host,
            NPairKokkosHalffullNewtonTrimHost,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRIM | NP_KOKKOS_HOST);
+
+typedef NPairHalffullKokkos<LMPDeviceType,1,0,1> NPairKokkosHalffullNewtonTrimDevice;
+NPairStyle(halffull/newton/trim/skip/kk/device,
+           NPairKokkosHalffullNewtonTrimDevice,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_SKIP | NP_TRIM | NP_KOKKOS_DEVICE);
+
+typedef NPairHalffullKokkos<LMPHostType,1,0,1> NPairKokkosHalffullNewtonTrimHost;
+NPairStyle(halffull/newton/trim/skip/kk/host,
+           NPairKokkosHalffullNewtonTrimHost,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_SKIP | NP_TRIM | NP_KOKKOS_HOST);
+
+// Newton, triclinic
+
+typedef NPairHalffullKokkos<LMPDeviceType,1,1,1> NPairKokkosHalffullNewtonTriTrimDevice;
+NPairStyle(halffull/newton/tri/trim/kk/device,
+           NPairKokkosHalffullNewtonTriTrimDevice,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI | NP_TRIM | NP_KOKKOS_DEVICE);
+
+typedef NPairHalffullKokkos<LMPHostType,1,1,1> NPairKokkosHalffullNewtonTriTrimHost;
+NPairStyle(halffull/newton/tri/trim/kk/host,
+           NPairKokkosHalffullNewtonTriTrimHost,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_TRIM | NP_KOKKOS_HOST);
 
-typedef NPairHalffullKokkos<LMPDeviceType,1,1> NPairKokkosHalffullNewtonTrimDevice;
-NPairStyle(halffull/newton/skip/trim/kk/device,
-           NPairKokkosHalffullNewtonTrimDevice,
+typedef NPairHalffullKokkos<LMPDeviceType,1,1,1> NPairKokkosHalffullNewtonTriTrimDevice;
+NPairStyle(halffull/newton/tri/trim/skip/kk/device,
+           NPairKokkosHalffullNewtonTriTrimDevice,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_SKIP | NP_TRIM | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,1,1> NPairKokkosHalffullNewtonTrimHost;
-NPairStyle(halffull/newton/skip/trim/kk/host,
-           NPairKokkosHalffullNewtonTrimHost,
+typedef NPairHalffullKokkos<LMPHostType,1,1,1> NPairKokkosHalffullNewtonTriTrimHost;
+NPairStyle(halffull/newton/tri/trim/skip/kk/host,
+           NPairKokkosHalffullNewtonTriTrimHost,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_SKIP | NP_TRIM | NP_KOKKOS_HOST);
 
-// Newtoff
+// Newtoff (can be triclinic but template param always set to 0)
 
-typedef NPairHalffullKokkos<LMPDeviceType,0,1> NPairKokkosHalffullNewtoffTrimDevice;
+typedef NPairHalffullKokkos<LMPDeviceType,0,0,1> NPairKokkosHalffullNewtoffTrimDevice;
 NPairStyle(halffull/newtoff/trim/kk/device,
            NPairKokkosHalffullNewtoffTrimDevice,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_TRIM | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,0,1> NPairKokkosHalffullNewtoffTrimHost;
+typedef NPairHalffullKokkos<LMPHostType,0,0,1> NPairKokkosHalffullNewtoffTrimHost;
 NPairStyle(halffull/newtoff/trim/kk/host,
            NPairKokkosHalffullNewtoffTrimHost,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_TRIM | NP_KOKKOS_HOST);
 
-typedef NPairHalffullKokkos<LMPDeviceType,0,1> NPairKokkosHalffullNewtoffTrimDevice;
-NPairStyle(halffull/newtoff/skip/trim/kk/device,
+typedef NPairHalffullKokkos<LMPDeviceType,0,0,1> NPairKokkosHalffullNewtoffTrimDevice;
+NPairStyle(halffull/newtoff/trim/skip/kk/device,
            NPairKokkosHalffullNewtoffTrimDevice,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_SKIP | NP_TRIM | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,0,1> NPairKokkosHalffullNewtoffTrimHost;
-NPairStyle(halffull/newtoff/skip/trim/kk/host,
+typedef NPairHalffullKokkos<LMPHostType,0,0,1> NPairKokkosHalffullNewtoffTrimHost;
+NPairStyle(halffull/newtoff/trim/skip/kk/host,
            NPairKokkosHalffullNewtoffTrimHost,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_SKIP |  NP_TRIM | NP_KOKKOS_HOST);
 
 //************ Ghost **************
 
-// Newton
+// Newton, no triclinic
 
-typedef NPairHalffullKokkos<LMPDeviceType,1,1> NPairKokkosHalffullNewtonGhostTrimDevice;
-NPairStyle(halffull/newton/ghost/trim/kk/device,
-           NPairKokkosHalffullNewtonGhostTrimDevice,
+typedef NPairHalffullKokkos<LMPDeviceType,1,0,1> NPairKokkosHalffullNewtonTrimDevice;
+NPairStyle(halffull/newton/trim/ghost/kk/device,
+           NPairKokkosHalffullNewtonTrimDevice,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_GHOST | NP_TRIM | NP_KOKKOS_DEVICE);
+
+typedef NPairHalffullKokkos<LMPHostType,1,0,1> NPairKokkosHalffullNewtonTrimHost;
+NPairStyle(halffull/newton/trim/ghost/kk/host,
+           NPairKokkosHalffullNewtonTrimHost,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_GHOST | NP_TRIM | NP_KOKKOS_HOST);
+
+typedef NPairHalffullKokkos<LMPDeviceType,1,0,1> NPairKokkosHalffullNewtonTrimDevice;
+NPairStyle(halffull/newton/trim/skip/ghost/kk/device,
+           NPairKokkosHalffullNewtonTrimDevice,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_GHOST | NP_SKIP | NP_TRIM | NP_KOKKOS_DEVICE);
+
+typedef NPairHalffullKokkos<LMPHostType,1,0,1> NPairKokkosHalffullNewtonTrimHost;
+NPairStyle(halffull/newton/trim/skip/ghost/kk/host,
+           NPairKokkosHalffullNewtonTrimHost,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_GHOST | NP_SKIP | NP_TRIM | NP_KOKKOS_HOST);
+
+// Newton, triclinic
+
+typedef NPairHalffullKokkos<LMPDeviceType,1,1,1> NPairKokkosHalffullNewtonTriTrimDevice;
+NPairStyle(halffull/newton/tri/trim/ghost/kk/device,
+           NPairKokkosHalffullNewtonTriTrimDevice,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_TRIM | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,1,1> NPairKokkosHalffullNewtonTrimHost;
-NPairStyle(halffull/newton/ghost/trim/kk/host,
-           NPairKokkosHalffullNewtonTrimHost,
+typedef NPairHalffullKokkos<LMPHostType,1,1,1> NPairKokkosHalffullNewtonTriTrimHost;
+NPairStyle(halffull/newton/tri/trim/ghost/kk/host,
+           NPairKokkosHalffullNewtonTriTrimHost,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_TRIM | NP_KOKKOS_HOST);
 
-typedef NPairHalffullKokkos<LMPDeviceType,1,1> NPairKokkosHalffullNewtonGhostTrimDevice;
-NPairStyle(halffull/newton/skip/ghost/trim/kk/device,
-           NPairKokkosHalffullNewtonGhostTrimDevice,
+typedef NPairHalffullKokkos<LMPDeviceType,1,1,1> NPairKokkosHalffullNewtonTriTrimDevice;
+NPairStyle(halffull/newton/tri/trim/skip/ghost/kk/device,
+           NPairKokkosHalffullNewtonTriTrimDevice,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_SKIP | NP_TRIM | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,1,1> NPairKokkosHalffullNewtonTrimHost;
-NPairStyle(halffull/newton/skip/ghost/trim/kk/host,
-           NPairKokkosHalffullNewtonTrimHost,
+typedef NPairHalffullKokkos<LMPHostType,1,1,1> NPairKokkosHalffullNewtonTriTrimHost;
+NPairStyle(halffull/newton/tri/trim/skip/ghost/kk/host,
+           NPairKokkosHalffullNewtonTriTrimHost,
            NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_SKIP | NP_TRIM | NP_KOKKOS_HOST);
 
-// Newtoff
+// Newtoff (can be triclinic but template param always set to 0)
 
-typedef NPairHalffullKokkos<LMPDeviceType,0,1> NPairKokkosHalffullNewtoffGhostTrimDevice;
-NPairStyle(halffull/newtoff/ghost/trim/kk/device,
-           NPairKokkosHalffullNewtoffGhostTrimDevice,
+typedef NPairHalffullKokkos<LMPDeviceType,0,0,1> NPairKokkosHalffullNewtoffTrimDevice;
+NPairStyle(halffull/newtoff/trim/ghost/kk/device,
+           NPairKokkosHalffullNewtoffTrimDevice,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_TRIM | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,0,1> NPairKokkosHalffullNewtoffTrimHost;
-NPairStyle(halffull/newtoff/ghost/trim/kk/host,
+typedef NPairHalffullKokkos<LMPHostType,0,0,1> NPairKokkosHalffullNewtoffTrimHost;
+NPairStyle(halffull/newtoff/trim/ghost/kk/host,
            NPairKokkosHalffullNewtoffTrimHost,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_TRIM | NP_KOKKOS_HOST);
 
-typedef NPairHalffullKokkos<LMPDeviceType,0,1> NPairKokkosHalffullNewtoffGhostTrimDevice;
-NPairStyle(halffull/newtoff/skip/ghost/trim/kk/device,
-           NPairKokkosHalffullNewtoffGhostTrimDevice,
+typedef NPairHalffullKokkos<LMPDeviceType,0,0,1> NPairKokkosHalffullNewtoffTrimDevice;
+NPairStyle(halffull/newtoff/trim/skip/ghost/kk/device,
+           NPairKokkosHalffullNewtoffTrimDevice,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_SKIP | NP_TRIM | NP_KOKKOS_DEVICE);
 
-typedef NPairHalffullKokkos<LMPHostType,0,1> NPairKokkosHalffullNewtoffTrimHost;
-NPairStyle(halffull/newtoff/skip/ghost/trim/kk/host,
+typedef NPairHalffullKokkos<LMPHostType,0,0,1> NPairKokkosHalffullNewtoffTrimHost;
+NPairStyle(halffull/newtoff/trim/skip/ghost/kk/host,
            NPairKokkosHalffullNewtoffTrimHost,
            NP_HALF_FULL | NP_NEWTOFF | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
            NP_ORTHO | NP_TRI | NP_GHOST | NP_SKIP | NP_TRIM | NP_KOKKOS_HOST);
+
 // clang-format on
 #else
 
@@ -244,7 +348,7 @@ namespace LAMMPS_NS {
 
 struct TagNPairHalffullCompute{};
 
-template<class DeviceType, int NEWTON, int TRIM>
+template<class DeviceType, int NEWTON, int TRI, int TRIM>
 class NPairHalffullKokkos : public NPair {
  public:
   typedef DeviceType device_type;
@@ -257,8 +361,8 @@ class NPairHalffullKokkos : public NPair {
   void operator()(TagNPairHalffullCompute, const int&) const;
 
  private:
-  int nlocal;
-  double cutsq_custom;
+  int nlocal,triclinic;
+  double cutsq_custom,delta;
 
   typename AT::t_x_array_randomread x;
 
diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index 06567cbeb6..f677b3a1bf 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -155,6 +155,8 @@ void NPairKokkos<DeviceType,HALF,NEWTON,GHOST,TRI,SIZE>::build(NeighList *list_)
 
   list->grow(nall);
 
+  const double delta = 0.01 * force->angstrom;
+
   NeighborKokkosExecute<DeviceType>
     data(*list,
          k_cutneighsq.view<DeviceType>(),
@@ -176,7 +178,7 @@ void NPairKokkos<DeviceType,HALF,NEWTON,GHOST,TRI,SIZE>::build(NeighList *list_)
          atomKK->molecular,
          nbinx,nbiny,nbinz,mbinx,mbiny,mbinz,mbinxlo,mbinylo,mbinzlo,
          bininvx,bininvy,bininvz,
-         exclude, nex_type,
+         delta, exclude, nex_type,
          k_ex1_type.view<DeviceType>(),
          k_ex2_type.view<DeviceType>(),
          k_ex_type.view<DeviceType>(),
@@ -217,6 +219,8 @@ void NPairKokkos<DeviceType,HALF,NEWTON,GHOST,TRI,SIZE>::build(NeighList *list_)
       atomKK->sync(Device,X_MASK|RADIUS_MASK|TYPE_MASK);
   }
 
+  if (HALF && NEWTON && TRI) atomKK->sync(Device,TAG_MASK);
+
   data.special_flag[0] = special_flag[0];
   data.special_flag[1] = special_flag[1];
   data.special_flag[2] = special_flag[2];
@@ -261,7 +265,7 @@ void NPairKokkos<DeviceType,HALF,NEWTON,GHOST,TRI,SIZE>::build(NeighList *list_)
 //#endif
     } else {
       if (SIZE) {
-        NPairKokkosBuildFunctorSize<DeviceType,HALF,NEWTON,TRI> f(data,atoms_per_bin * 6 * sizeof(X_FLOAT) * factor);
+        NPairKokkosBuildFunctorSize<DeviceType,HALF,NEWTON,TRI> f(data,atoms_per_bin * 7 * sizeof(X_FLOAT) * factor);
 #ifdef LMP_KOKKOS_GPU
         if (ExecutionSpaceFromDevice<DeviceType>::space == Device) {
           int team_size = atoms_per_bin*factor;
@@ -279,7 +283,7 @@ void NPairKokkos<DeviceType,HALF,NEWTON,GHOST,TRI,SIZE>::build(NeighList *list_)
         Kokkos::parallel_for(nall, f);
 #endif
       } else {
-        NPairKokkosBuildFunctor<DeviceType,HALF,NEWTON,TRI> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
+        NPairKokkosBuildFunctor<DeviceType,HALF,NEWTON,TRI> f(data,atoms_per_bin * 6 * sizeof(X_FLOAT) * factor);
 #ifdef LMP_KOKKOS_GPU
         if (ExecutionSpaceFromDevice<DeviceType>::space == Device) {
           int team_size = atoms_per_bin*factor;
@@ -414,6 +418,8 @@ void NeighborKokkosExecute<DeviceType>::
   const X_FLOAT ytmp = x(i, 1);
   const X_FLOAT ztmp = x(i, 2);
   const int itype = type(i);
+  tagint itag;
+  if (HalfNeigh && Newton && Tri) itag = tag(i);
 
   const int ibin = c_atom2bin(i);
 
@@ -484,13 +490,29 @@ void NeighborKokkosExecute<DeviceType>::
 
         if (HalfNeigh && !Newton && j <= i) continue;
         if (!HalfNeigh && j == i) continue;
+
+        // for triclinic, bin stencil is full in all 3 dims
+        // must use itag/jtag to eliminate half the I/J interactions
+        // cannot use I/J exact coord comparison
+        //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+        //   with an added PBC offset can shift all 3 coords by epsilon
+
         if (HalfNeigh && Newton && Tri) {
-          if (x(j,2) < ztmp) continue;
-          if (x(j,2) == ztmp) {
-            if (x(j,1) < ytmp) continue;
-            if (x(j,1) == ytmp) {
-              if (x(j,0) < xtmp) continue;
-              if (x(j,0) == xtmp && j <= i) continue;
+          if (j <= i) continue;
+          if (j >= nlocal) {
+            const tagint jtag = tag(j);
+            if (itag > jtag) {
+              if ((itag+jtag) % 2 == 0) continue;
+            } else if (itag < jtag) {
+              if ((itag+jtag) % 2 == 1) continue;
+            } else {
+              if (fabs(x(j,2)-ztmp) > delta) {
+                if (x(j,2) < ztmp) continue;
+              } else if (fabs(x(j,1)-ytmp) > delta) {
+                if (x(j,1) < ytmp) continue;
+              } else {
+                if (x(j,0) < xtmp) continue;
+              }
             }
           }
         }
@@ -568,8 +590,9 @@ void NeighborKokkosExecute<DeviceType>::build_ItemGPU(typename Kokkos::TeamPolic
                                                       size_t sharedsize) const
 {
   auto* sharedmem = static_cast<X_FLOAT *>(dev.team_shmem().get_shmem(sharedsize));
-  /* loop over atoms in i's bin,
-  */
+
+  // loop over atoms in i's bin
+
   const int atoms_per_bin = c_bins.extent(1);
   const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin <1?1:dev.team_size()/atoms_per_bin;
   const int TEAMS_PER_BIN = atoms_per_bin/dev.team_size()<1?1:atoms_per_bin/dev.team_size();
@@ -579,15 +602,14 @@ void NeighborKokkosExecute<DeviceType>::build_ItemGPU(typename Kokkos::TeamPolic
 
   if (ibin >= mbins) return;
 
-  X_FLOAT* other_x = sharedmem + 5*atoms_per_bin*MY_BIN;
-  int* other_id = (int*) &other_x[4 * atoms_per_bin];
+  X_FLOAT* other_x = sharedmem + 6*atoms_per_bin*MY_BIN;
+  int* other_id = (int*) &other_x[5 * atoms_per_bin];
 
   int bincount_current = c_bincount[ibin];
 
   for (int kk = 0; kk < TEAMS_PER_BIN; kk++) {
     const int MY_II = dev.team_rank()%atoms_per_bin+kk*dev.team_size();
     const int i = MY_II < bincount_current ? c_bins(ibin, MY_II) : -1;
-    /* if necessary, goto next page and add pages */
 
     int n = 0;
 
@@ -595,6 +617,7 @@ void NeighborKokkosExecute<DeviceType>::build_ItemGPU(typename Kokkos::TeamPolic
     X_FLOAT ytmp;
     X_FLOAT ztmp;
     int itype;
+    tagint itag;
     const int index = (i >= 0 && i < nlocal) ? i : 0;
     const AtomNeighbors neighbors_i = neigh_transpose ?
     neigh_list.get_neighbors_transpose(index) : neigh_list.get_neighbors(index);
@@ -608,6 +631,10 @@ void NeighborKokkosExecute<DeviceType>::build_ItemGPU(typename Kokkos::TeamPolic
       other_x[MY_II + atoms_per_bin] = ytmp;
       other_x[MY_II + 2 * atoms_per_bin] = ztmp;
       other_x[MY_II + 3 * atoms_per_bin] = itype;
+      if (HalfNeigh && Newton && Tri) {
+        itag = tag(i);
+        other_x[MY_II + 4 * atoms_per_bin] = itag;
+      }
     }
     other_id[MY_II] = i;
 
@@ -695,6 +722,8 @@ void NeighborKokkosExecute<DeviceType>::build_ItemGPU(typename Kokkos::TeamPolic
         other_x[MY_II + atoms_per_bin] = x(j, 1);
         other_x[MY_II + 2 * atoms_per_bin] = x(j, 2);
         other_x[MY_II + 3 * atoms_per_bin] = type(j);
+        if (HalfNeigh && Newton && Tri)
+          other_x[MY_II + 4 * atoms_per_bin] = tag(j);
       }
 
       other_id[MY_II] = j;
@@ -708,13 +737,29 @@ void NeighborKokkosExecute<DeviceType>::build_ItemGPU(typename Kokkos::TeamPolic
 
           if (HalfNeigh && !Newton && j <= i) continue;
           if (!HalfNeigh && j == i) continue;
+
+          // for triclinic, bin stencil is full in all 3 dims
+          // must use itag/jtag to eliminate half the I/J interactions
+          // cannot use I/J exact coord comparison
+          //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+          //   with an added PBC offset can shift all 3 coords by epsilon
+
           if (HalfNeigh && Newton && Tri) {
-            if (x(j,2) < ztmp) continue;
-            if (x(j,2) == ztmp) {
-              if (x(j,1) < ytmp) continue;
-              if (x(j,1) == ytmp) {
-                if (x(j,0) < xtmp) continue;
-                if (x(j,0) == xtmp && j <= i) continue;
+            if (j <= i) continue;
+            if (j >= nlocal) {
+              const tagint jtag = other_x[m + 4 * atoms_per_bin];
+              if (itag > jtag) {
+                if ((itag+jtag) % 2 == 0) continue;
+              } else if (itag < jtag) {
+                if ((itag+jtag) % 2 == 1) continue;
+              } else {
+                if (fabs(x(j,2)-ztmp) > delta) {
+                  if (x(j,2) < ztmp) continue;
+                } else if (fabs(x(j,1)-ytmp) > delta) {
+                  if (x(j,1) < ytmp) continue;
+                } else {
+                  if (x(j,0) < xtmp) continue;
+                }
               }
             }
           }
@@ -905,6 +950,7 @@ void NeighborKokkosExecute<DeviceType>::build_ItemGhostGPU(typename Kokkos::Team
                                                       size_t sharedsize) const
 {
   auto* sharedmem = static_cast<X_FLOAT *>(dev.team_shmem().get_shmem(sharedsize));
+
   // loop over atoms in i's bin
 
   const int atoms_per_bin = c_bins.extent(1);
@@ -1084,6 +1130,8 @@ void NeighborKokkosExecute<DeviceType>::
   const X_FLOAT ztmp = x(i, 2);
   const X_FLOAT radi = radius(i);
   const int itype = type(i);
+  tagint itag;
+  if (HalfNeigh && Newton && Tri) itag = tag(i);
 
   const int ibin = c_atom2bin(i);
 
@@ -1167,13 +1215,29 @@ void NeighborKokkosExecute<DeviceType>::
 
       if (HalfNeigh && !Newton && j <= i) continue;
       if (!HalfNeigh && j == i) continue;
+
+      // for triclinic, bin stencil is full in all 3 dims
+      // must use itag/jtag to eliminate half the I/J interactions
+      // cannot use I/J exact coord comparison
+      //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+      //   with an added PBC offset can shift all 3 coords by epsilon
+
       if (HalfNeigh && Newton && Tri) {
-        if (x(j,2) < ztmp) continue;
-        if (x(j,2) == ztmp) {
-          if (x(j,1) < ytmp) continue;
-          if (x(j,1) == ytmp) {
-            if (x(j,0) < xtmp) continue;
-            if (x(j,0) == xtmp && j <= i) continue;
+        if (j <= i) continue;
+        if (j >= nlocal) {
+          const tagint jtag = tag(j);
+          if (itag > jtag) {
+            if ((itag+jtag) % 2 == 0) continue;
+          } else if (itag < jtag) {
+            if ((itag+jtag) % 2 == 1) continue;
+          } else {
+            if (fabs(x(j,2)-ztmp) > delta) {
+              if (x(j,2) < ztmp) continue;
+            } else if (fabs(x(j,1)-ytmp) > delta) {
+              if (x(j,1) < ytmp) continue;
+            } else {
+              if (x(j,0) < xtmp) continue;
+            }
           }
         }
       }
@@ -1245,8 +1309,9 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeGPU(typename Kokkos::TeamP
                                                           size_t sharedsize) const
 {
   auto* sharedmem = static_cast<X_FLOAT *>(dev.team_shmem().get_shmem(sharedsize));
-  /* loop over atoms in i's bin,
-   */
+
+  // loop over atoms in i's bin
+
   const int atoms_per_bin = c_bins.extent(1);
   const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin <1?1:dev.team_size()/atoms_per_bin;
   const int TEAMS_PER_BIN = atoms_per_bin/dev.team_size()<1?1:atoms_per_bin/dev.team_size();
@@ -1256,15 +1321,14 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeGPU(typename Kokkos::TeamP
 
   if (ibin >= mbins) return;
 
-  X_FLOAT* other_x = sharedmem + 6*atoms_per_bin*MY_BIN;
-  int* other_id = (int*) &other_x[5 * atoms_per_bin];
+  X_FLOAT* other_x = sharedmem + 7*atoms_per_bin*MY_BIN;
+  int* other_id = (int*) &other_x[6 * atoms_per_bin];
 
   int bincount_current = c_bincount[ibin];
 
   for (int kk = 0; kk < TEAMS_PER_BIN; kk++) {
     const int MY_II = dev.team_rank()%atoms_per_bin+kk*dev.team_size();
     const int i = MY_II < bincount_current ? c_bins(ibin, MY_II) : -1;
-    /* if necessary, goto next page and add pages */
 
     int n = 0;
 
@@ -1273,6 +1337,7 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeGPU(typename Kokkos::TeamP
     X_FLOAT ztmp;
     X_FLOAT radi;
     int itype;
+    tagint itag;
     const int index = (i >= 0 && i < nlocal) ? i : 0;
     const AtomNeighbors neighbors_i = neigh_transpose ?
     neigh_list.get_neighbors_transpose(index) : neigh_list.get_neighbors(index);
@@ -1289,6 +1354,10 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeGPU(typename Kokkos::TeamP
       other_x[MY_II + 2 * atoms_per_bin] = ztmp;
       other_x[MY_II + 3 * atoms_per_bin] = itype;
       other_x[MY_II + 4 * atoms_per_bin] = radi;
+      if (HalfNeigh && Newton && Tri) { 
+        itag = tag(i);
+        other_x[MY_II + 5 * atoms_per_bin] = itag;
+      }
     }
     other_id[MY_II] = i;
 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
@@ -1381,6 +1450,8 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeGPU(typename Kokkos::TeamP
         other_x[MY_II + 2 * atoms_per_bin] = x(j, 2);
         other_x[MY_II + 3 * atoms_per_bin] = type(j);
         other_x[MY_II + 4 * atoms_per_bin] = radius(j);
+        if (HalfNeigh && Newton && Tri)
+          other_x[MY_II + 5 * atoms_per_bin] = tag(j);
       }
 
       other_id[MY_II] = j;
@@ -1394,13 +1465,29 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeGPU(typename Kokkos::TeamP
 
           if (HalfNeigh && !Newton && j <= i) continue;
           if (!HalfNeigh && j == i) continue;
+
+          // for triclinic, bin stencil is full in all 3 dims
+          // must use itag/jtag to eliminate half the I/J interactions
+          // cannot use I/J exact coord comparison
+          //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+          //   with an added PBC offset can shift all 3 coords by epsilon
+
           if (HalfNeigh && Newton && Tri) {
-            if (x(j,2) < ztmp) continue;
-            if (x(j,2) == ztmp) {
-              if (x(j,1) < ytmp) continue;
-              if (x(j,1) == ytmp) {
-                if (x(j,0) < xtmp) continue;
-                if (x(j,0) == xtmp && j <= i) continue;
+            if (j <= i) continue;
+            if (j >= nlocal) {
+              const tagint jtag = other_x[m + 5 * atoms_per_bin];
+              if (itag > jtag) {
+                if ((itag+jtag) % 2 == 0) continue;
+              } else if (itag < jtag) {
+                if ((itag+jtag) % 2 == 1) continue;
+              } else {
+                if (fabs(x(j,2)-ztmp) > delta) {
+                  if (x(j,2) < ztmp) continue;
+                } else if (fabs(x(j,1)-ytmp) > delta) {
+                  if (x(j,1) < ytmp) continue;
+                } else {
+                  if (x(j,0) < xtmp) continue;
+                }
               }
             }
           }
diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h
index 4427012926..fe5484a771 100644
--- a/src/KOKKOS/npair_kokkos.h
+++ b/src/KOKKOS/npair_kokkos.h
@@ -189,6 +189,8 @@ class NeighborKokkosExecute
  public:
   NeighListKokkos<DeviceType> neigh_list;
 
+  const double delta;
+
   // data from Neighbor class
 
   const typename AT::t_xfloat_2d_randomread cutneighsq;
@@ -282,7 +284,7 @@ class NeighborKokkosExecute
                         const int & _mbinx,const int & _mbiny,const int & _mbinz,
                         const int & _mbinxlo,const int & _mbinylo,const int & _mbinzlo,
                         const X_FLOAT &_bininvx,const X_FLOAT &_bininvy,const X_FLOAT &_bininvz,
-                        const int & _exclude,const int & _nex_type,
+                        const double &_delta,const int & _exclude,const int & _nex_type,
                         const typename AT::t_int_1d_const & _ex1_type,
                         const typename AT::t_int_1d_const & _ex2_type,
                         const typename AT::t_int_2d_const & _ex_type,
@@ -301,7 +303,7 @@ class NeighborKokkosExecute
                         const typename ArrayTypes<LMPHostType>::t_int_scalar _h_resize,
                         const typename AT::t_int_scalar _new_maxneighs,
                         const typename ArrayTypes<LMPHostType>::t_int_scalar _h_new_maxneighs):
-    neigh_list(_neigh_list), cutneighsq(_cutneighsq),exclude(_exclude),
+    neigh_list(_neigh_list), cutneighsq(_cutneighsq),delta(_delta),exclude(_exclude),
     nex_type(_nex_type),ex1_type(_ex1_type),ex2_type(_ex2_type),
     ex_type(_ex_type),nex_group(_nex_group),
     ex1_bit(_ex1_bit),ex2_bit(_ex2_bit),
diff --git a/src/KOKKOS/npair_trim_kokkos.cpp b/src/KOKKOS/npair_trim_kokkos.cpp
index 97931bf250..d04d8676d7 100644
--- a/src/KOKKOS/npair_trim_kokkos.cpp
+++ b/src/KOKKOS/npair_trim_kokkos.cpp
@@ -62,8 +62,8 @@ void NPairTrimKokkos<DeviceType>::trim_to_kokkos(NeighList *list)
   d_ilist_copy = k_list_copy->d_ilist;
   d_numneigh_copy = k_list_copy->d_numneigh;
   d_neighbors_copy = k_list_copy->d_neighbors;
-  int inum_copy = list->listcopy->inum;
-  if (list->ghost) inum_copy += list->listcopy->gnum;
+  int inum_trim = list->listcopy->inum;
+  if (list->ghost) inum_trim += list->listcopy->gnum;
 
   NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
   k_list->maxneighs = k_list_copy->maxneighs; // simple, but could be made more memory efficient
@@ -75,7 +75,7 @@ void NPairTrimKokkos<DeviceType>::trim_to_kokkos(NeighList *list)
   // loop over parent list and trim
 
   copymode = 1;
-  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagNPairTrim>(0,inum_copy),*this);
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagNPairTrim>(0,inum_trim),*this);
   copymode = 0;
 
   list->inum = k_list_copy->inum;
@@ -132,8 +132,8 @@ void NPairTrimKokkos<DeviceType>::trim_to_cpu(NeighList *list)
 
   int inum = listcopy->inum;
   int gnum = listcopy->gnum;
-  int inum_all = inum;
-  if (list->ghost) inum_all += gnum;
+  int inum_trim = inum;
+  if (list->ghost) inum_trim += gnum;
   auto h_ilist = listcopy_kk->k_ilist.h_view;
   auto h_numneigh = Kokkos::create_mirror_view_and_copy(LMPHostType(),listcopy_kk->d_numneigh);
   auto h_neighbors = Kokkos::create_mirror_view_and_copy(LMPHostType(),listcopy_kk->d_neighbors);
@@ -151,7 +151,7 @@ void NPairTrimKokkos<DeviceType>::trim_to_cpu(NeighList *list)
   MyPage<int> *ipage = list->ipage;
   ipage->reset();
 
-  for (int ii = 0; ii < inum_all; ii++) {
+  for (int ii = 0; ii < inum_trim; ii++) {
     int n = 0;
     neighptr = ipage->vget();
 
diff --git a/src/KOKKOS/pair_buck_coul_cut_kokkos.h b/src/KOKKOS/pair_buck_coul_cut_kokkos.h
index b91348d557..9b6cc31898 100644
--- a/src/KOKKOS/pair_buck_coul_cut_kokkos.h
+++ b/src/KOKKOS/pair_buck_coul_cut_kokkos.h
@@ -112,15 +112,18 @@ class PairBuckCoulCutKokkos : public PairBuckCoulCut {
 
   void allocate() override;
 
-  friend struct PairComputeFunctor<PairBuckCoulCutKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairBuckCoulCutKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairBuckCoulCutKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairBuckCoulCutKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairBuckCoulCutKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairBuckCoulCutKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairBuckCoulCutKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairBuckCoulCutKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairBuckCoulCutKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairBuckCoulCutKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulCutKokkos,FULL,void>(PairBuckCoulCutKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulCutKokkos,HALF,void>(PairBuckCoulCutKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulCutKokkos,HALFTHREAD,void>(PairBuckCoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulCutKokkos,FULL,0>(PairBuckCoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulCutKokkos,FULL,1>(PairBuckCoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulCutKokkos,HALF>(PairBuckCoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulCutKokkos,HALFTHREAD>(PairBuckCoulCutKokkos*,NeighListKokkos<DeviceType>*);
   friend EV_FLOAT pair_compute<PairBuckCoulCutKokkos,void>(PairBuckCoulCutKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairBuckCoulCutKokkos>(PairBuckCoulCutKokkos*);
diff --git a/src/KOKKOS/pair_buck_coul_long_kokkos.h b/src/KOKKOS/pair_buck_coul_long_kokkos.h
index b776a84e3c..bed9b0d0f8 100644
--- a/src/KOKKOS/pair_buck_coul_long_kokkos.h
+++ b/src/KOKKOS/pair_buck_coul_long_kokkos.h
@@ -115,27 +115,33 @@ class PairBuckCoulLongKokkos : public PairBuckCoulLong {
 
   void allocate() override;
 
-  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,FULL,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALF,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALFTHREAD,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,FULL,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALF,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALFTHREAD,false,CoulLongTable<1> >;
-  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,FULL,CoulLongTable<1> >(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,HALF,CoulLongTable<1> >(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,HALFTHREAD,CoulLongTable<1> >(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairBuckCoulLongKokkos,CoulLongTable<1> >(PairBuckCoulLongKokkos*,
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,FULL,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,FULL,true,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALF,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,FULL,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,FULL,false,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALF,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<1>>;
+  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,FULL,0,CoulLongTable<1>>(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,FULL,1,CoulLongTable<1>>(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,HALF,0,CoulLongTable<1>>(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,HALFTHREAD,0,CoulLongTable<1>>(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairBuckCoulLongKokkos,CoulLongTable<1>>(PairBuckCoulLongKokkos*,
                                                             NeighListKokkos<DeviceType>*);
-  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,FULL,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALF,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALFTHREAD,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,FULL,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALF,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALFTHREAD,false,CoulLongTable<0> >;
-  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,FULL,CoulLongTable<0> >(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,HALF,CoulLongTable<0> >(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,HALFTHREAD,CoulLongTable<0> >(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairBuckCoulLongKokkos,CoulLongTable<0> >(PairBuckCoulLongKokkos*,
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,FULL,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,FULL,true,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALF,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,FULL,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,FULL,false,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALF,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairBuckCoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<0>>;
+  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,FULL,0,CoulLongTable<0>>(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,FULL,1,CoulLongTable<0>>(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,HALF,0,CoulLongTable<0>>(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckCoulLongKokkos,HALFTHREAD,0,CoulLongTable<0>>(PairBuckCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairBuckCoulLongKokkos,CoulLongTable<0>>(PairBuckCoulLongKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairBuckCoulLongKokkos>(PairBuckCoulLongKokkos*);
 
diff --git a/src/KOKKOS/pair_buck_kokkos.h b/src/KOKKOS/pair_buck_kokkos.h
index 364716453b..15325cd56a 100644
--- a/src/KOKKOS/pair_buck_kokkos.h
+++ b/src/KOKKOS/pair_buck_kokkos.h
@@ -91,16 +91,19 @@ class PairBuckKokkos : public PairBuck {
   int nlocal,nall,eflag,vflag;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairBuckKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairBuckKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairBuckKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairBuckKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairBuckKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairBuckKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairBuckKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairBuckKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairBuckKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairBuckKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairBuckKokkos,FULL,void>(PairBuckKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairBuckKokkos,HALF,void>(PairBuckKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairBuckKokkos,HALFTHREAD,void>(PairBuckKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairBuckKokkos,void>(PairBuckKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckKokkos,FULL,0>(PairBuckKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckKokkos,FULL,1>(PairBuckKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckKokkos,HALF>(PairBuckKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairBuckKokkos,HALFTHREAD>(PairBuckKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairBuckKokkos>(PairBuckKokkos*,NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairBuckKokkos>(PairBuckKokkos*);
 };
 
diff --git a/src/KOKKOS/pair_coul_cut_kokkos.h b/src/KOKKOS/pair_coul_cut_kokkos.h
index 6626889660..3e0501edd9 100644
--- a/src/KOKKOS/pair_coul_cut_kokkos.h
+++ b/src/KOKKOS/pair_coul_cut_kokkos.h
@@ -112,15 +112,18 @@ class PairCoulCutKokkos : public PairCoulCut {
   double qqrd2e;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairCoulCutKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairCoulCutKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairCoulCutKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairCoulCutKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairCoulCutKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairCoulCutKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairCoulCutKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairCoulCutKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairCoulCutKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairCoulCutKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairCoulCutKokkos,FULL,void>(PairCoulCutKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairCoulCutKokkos,HALF,void>(PairCoulCutKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairCoulCutKokkos,HALFTHREAD,void>(PairCoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulCutKokkos,FULL,0>(PairCoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulCutKokkos,FULL,1>(PairCoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulCutKokkos,HALF>(PairCoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulCutKokkos,HALFTHREAD>(PairCoulCutKokkos*,NeighListKokkos<DeviceType>*);
   friend EV_FLOAT pair_compute<PairCoulCutKokkos,void>(PairCoulCutKokkos*,
                                                        NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairCoulCutKokkos>(PairCoulCutKokkos*);
diff --git a/src/KOKKOS/pair_coul_debye_kokkos.h b/src/KOKKOS/pair_coul_debye_kokkos.h
index b6bed9d557..d239291a25 100644
--- a/src/KOKKOS/pair_coul_debye_kokkos.h
+++ b/src/KOKKOS/pair_coul_debye_kokkos.h
@@ -112,15 +112,18 @@ class PairCoulDebyeKokkos : public PairCoulDebye {
   double qqrd2e;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairCoulDebyeKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairCoulDebyeKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairCoulDebyeKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairCoulDebyeKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairCoulDebyeKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairCoulDebyeKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairCoulDebyeKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairCoulDebyeKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairCoulDebyeKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairCoulDebyeKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairCoulDebyeKokkos,FULL,void>(PairCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairCoulDebyeKokkos,HALF,void>(PairCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairCoulDebyeKokkos,HALFTHREAD,void>(PairCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulDebyeKokkos,FULL,0>(PairCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulDebyeKokkos,FULL,1>(PairCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulDebyeKokkos,HALF>(PairCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulDebyeKokkos,HALFTHREAD>(PairCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
   friend EV_FLOAT pair_compute<PairCoulDebyeKokkos,void>(PairCoulDebyeKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairCoulDebyeKokkos>(PairCoulDebyeKokkos*);
diff --git a/src/KOKKOS/pair_coul_long_kokkos.h b/src/KOKKOS/pair_coul_long_kokkos.h
index fcb1402028..232cdbb6df 100644
--- a/src/KOKKOS/pair_coul_long_kokkos.h
+++ b/src/KOKKOS/pair_coul_long_kokkos.h
@@ -114,27 +114,33 @@ class PairCoulLongKokkos : public PairCoulLong {
 
   void allocate() override;
 
-  friend struct PairComputeFunctor<PairCoulLongKokkos,FULL,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairCoulLongKokkos,HALF,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairCoulLongKokkos,HALFTHREAD,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairCoulLongKokkos,FULL,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairCoulLongKokkos,HALF,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairCoulLongKokkos,HALFTHREAD,false,CoulLongTable<1> >;
-  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,FULL,CoulLongTable<1> >(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,HALF,CoulLongTable<1> >(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,HALFTHREAD,CoulLongTable<1> >(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairCoulLongKokkos,CoulLongTable<1> >(PairCoulLongKokkos*,
+  friend struct PairComputeFunctor<PairCoulLongKokkos,FULL,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,FULL,true,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,HALF,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,FULL,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,FULL,false,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,HALF,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<1>>;
+  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,FULL,0,CoulLongTable<1>>(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,FULL,1,CoulLongTable<1>>(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,HALF,0,CoulLongTable<1>>(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,HALFTHREAD,0,CoulLongTable<1>>(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairCoulLongKokkos,CoulLongTable<1>>(PairCoulLongKokkos*,
                                                             NeighListKokkos<DeviceType>*);
-  friend struct PairComputeFunctor<PairCoulLongKokkos,FULL,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairCoulLongKokkos,HALF,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairCoulLongKokkos,HALFTHREAD,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairCoulLongKokkos,FULL,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairCoulLongKokkos,HALF,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairCoulLongKokkos,HALFTHREAD,false,CoulLongTable<0> >;
-  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,FULL,CoulLongTable<0> >(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,HALF,CoulLongTable<0> >(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,HALFTHREAD,CoulLongTable<0> >(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairCoulLongKokkos,CoulLongTable<0> >(PairCoulLongKokkos*,
+  friend struct PairComputeFunctor<PairCoulLongKokkos,FULL,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,FULL,true,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,HALF,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,FULL,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,FULL,false,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,HALF,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairCoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<0>>;
+  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,FULL,0,CoulLongTable<0>>(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,FULL,1,CoulLongTable<0>>(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,HALF,0,CoulLongTable<0>>(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairCoulLongKokkos,HALFTHREAD,0,CoulLongTable<0>>(PairCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairCoulLongKokkos,CoulLongTable<0>>(PairCoulLongKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairCoulLongKokkos>(PairCoulLongKokkos*);
 
diff --git a/src/KOKKOS/pair_eam_alloy_kokkos.cpp b/src/KOKKOS/pair_eam_alloy_kokkos.cpp
index 5cc6fa9443..0dfe56c365 100644
--- a/src/KOKKOS/pair_eam_alloy_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_alloy_kokkos.cpp
@@ -1477,7 +1477,7 @@ void PairEAMAlloyKokkos<DeviceType>::file2array_alloy()
 template<typename DeviceType>
 template<class TAG>
 struct PairEAMAlloyKokkos<DeviceType>::policyInstance {
-  KOKKOS_INLINE_FUNCTION
+
   static auto get(int inum) {
     auto policy = Kokkos::RangePolicy<DeviceType, TAG>(0,inum);
     return policy;
@@ -1488,7 +1488,7 @@ struct PairEAMAlloyKokkos<DeviceType>::policyInstance {
 template<>
 template<class TAG>
 struct PairEAMAlloyKokkos<Kokkos::Experimental::HIP>::policyInstance {
-  KOKKOS_INLINE_FUNCTION
+
   static auto get(int inum) {
     static_assert(t_ffloat_2d_n7::static_extent(2) == 7,
                   "Breaking assumption of spline dim for KernelAB and KernelC scratch caching");
diff --git a/src/KOKKOS/pair_eam_fs_kokkos.cpp b/src/KOKKOS/pair_eam_fs_kokkos.cpp
index 8e895dfeac..58ff615c04 100644
--- a/src/KOKKOS/pair_eam_fs_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_fs_kokkos.cpp
@@ -1487,7 +1487,7 @@ void PairEAMFSKokkos<DeviceType>::file2array_fs()
 template<typename DeviceType>
 template<class TAG>
 struct PairEAMFSKokkos<DeviceType>::policyInstance {
-  KOKKOS_INLINE_FUNCTION
+
   static auto get(int inum) {
     auto policy = Kokkos::RangePolicy<DeviceType, TAG>(0,inum);
     return policy;
@@ -1498,7 +1498,7 @@ struct PairEAMFSKokkos<DeviceType>::policyInstance {
 template<>
 template<class TAG>
 struct PairEAMFSKokkos<Kokkos::Experimental::HIP>::policyInstance {
-  KOKKOS_INLINE_FUNCTION
+
   static auto get(int inum) {
     static_assert(t_ffloat_2d_n7::static_extent(2) == 7,
                   "Breaking assumption of spline dim for KernelAB and KernelC scratch caching");
diff --git a/src/KOKKOS/pair_eam_kokkos.cpp b/src/KOKKOS/pair_eam_kokkos.cpp
index a3bc463bbf..864f736066 100644
--- a/src/KOKKOS/pair_eam_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_kokkos.cpp
@@ -1162,7 +1162,7 @@ void PairEAMKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int &
 template<typename DeviceType>
 template<class TAG>
 struct PairEAMKokkos<DeviceType>::policyInstance {
-  KOKKOS_INLINE_FUNCTION
+
   static auto get(int inum) {
     auto policy = Kokkos::RangePolicy<DeviceType, TAG>(0,inum);
     return policy;
@@ -1173,7 +1173,7 @@ struct PairEAMKokkos<DeviceType>::policyInstance {
 template<>
 template<class TAG>
 struct PairEAMKokkos<Kokkos::Experimental::HIP>::policyInstance {
-  KOKKOS_INLINE_FUNCTION
+
   static auto get(int inum) {
     static_assert(t_ffloat_2d_n7::static_extent(2) == 7,
                   "Breaking assumption of spline dim for KernelAB and KernelC scratch caching");
diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h
index 2c2a622791..d3c766f5ae 100644
--- a/src/KOKKOS/pair_kokkos.h
+++ b/src/KOKKOS/pair_kokkos.h
@@ -50,7 +50,7 @@ struct DoCoul<1> {
 
 
 //Specialisation for Neighborlist types Half, HalfThread, Full
-template <class PairStyle, int NEIGHFLAG, bool STACKPARAMS, class Specialisation = void>
+template <class PairStyle, int NEIGHFLAG, bool STACKPARAMS, int ZEROFLAG = 0, class Specialisation = void>
 struct PairComputeFunctor  {
   typedef typename PairStyle::device_type device_type ;
   typedef ArrayTypes<device_type> AT;
@@ -137,7 +137,7 @@ struct PairComputeFunctor  {
     F_FLOAT fytmp = 0.0;
     F_FLOAT fztmp = 0.0;
 
-    if (NEIGHFLAG == FULL) {
+    if (NEIGHFLAG == FULL && ZEROFLAG) {
       f(i,0) = 0.0;
       f(i,1) = 0.0;
       f(i,2) = 0.0;
@@ -211,7 +211,7 @@ struct PairComputeFunctor  {
     F_FLOAT fytmp = 0.0;
     F_FLOAT fztmp = 0.0;
 
-    if (NEIGHFLAG == FULL) {
+    if (NEIGHFLAG == FULL && ZEROFLAG) {
       f(i,0) = 0.0;
       f(i,1) = 0.0;
       f(i,2) = 0.0;
@@ -292,11 +292,13 @@ struct PairComputeFunctor  {
       const X_FLOAT ztmp = c.x(i,2);
       const int itype = c.type(i);
 
-      Kokkos::single(Kokkos::PerThread(team), [&] (){
-        f(i,0) = 0.0;
-        f(i,1) = 0.0;
-        f(i,2) = 0.0;
-      });
+      if (ZEROFLAG) {
+        Kokkos::single(Kokkos::PerThread(team), [&] (){
+          f(i,0) = 0.0;
+          f(i,1) = 0.0;
+          f(i,2) = 0.0;
+        });
+      }
 
       const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i);
       const int jnum = list.d_numneigh[i];
@@ -355,11 +357,13 @@ struct PairComputeFunctor  {
       const int itype = c.type(i);
       const F_FLOAT qtmp = c.q(i);
 
-      Kokkos::single(Kokkos::PerThread(team), [&] (){
-        f(i,0) = 0.0;
-        f(i,1) = 0.0;
-        f(i,2) = 0.0;
-      });
+      if (ZEROFLAG) {
+        Kokkos::single(Kokkos::PerThread(team), [&] (){
+          f(i,0) = 0.0;
+          f(i,1) = 0.0;
+          f(i,2) = 0.0;
+        });
+      }
 
       const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i);
       const int jnum = list.d_numneigh[i];
@@ -423,11 +427,13 @@ struct PairComputeFunctor  {
       const X_FLOAT ztmp = c.x(i,2);
       const int itype = c.type(i);
 
-      Kokkos::single(Kokkos::PerThread(team), [&] (){
-        f(i,0) = 0.0;
-        f(i,1) = 0.0;
-        f(i,2) = 0.0;
-      });
+      if (ZEROFLAG) {
+        Kokkos::single(Kokkos::PerThread(team), [&] (){
+          f(i,0) = 0.0;
+          f(i,1) = 0.0;
+          f(i,2) = 0.0;
+        });
+      }
 
       const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i);
       const int jnum = list.d_numneigh[i];
@@ -525,11 +531,13 @@ struct PairComputeFunctor  {
       const int itype = c.type(i);
       const F_FLOAT qtmp = c.q(i);
 
-      Kokkos::single(Kokkos::PerThread(team), [&] (){
-        f(i,0) = 0.0;
-        f(i,1) = 0.0;
-        f(i,2) = 0.0;
-      });
+      if (ZEROFLAG) {
+        Kokkos::single(Kokkos::PerThread(team), [&] (){
+          f(i,0) = 0.0;
+          f(i,1) = 0.0;
+          f(i,2) = 0.0;
+        });
+      }
 
       const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i);
       const int jnum = list.d_numneigh[i];
@@ -740,7 +748,7 @@ struct PairComputeFunctor  {
 // By having the enable_if with a ! and without it, exactly one of the functions
 // pair_compute_neighlist will match - either the dummy version
 // or the real one further below.
-template<class PairStyle, unsigned NEIGHFLAG, class Specialisation>
+template<class PairStyle, unsigned NEIGHFLAG, int ZEROFLAG = 0, class Specialisation = void>
 EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<!((NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0), NeighListKokkos<typename PairStyle::device_type>*> list) {
   EV_FLOAT ev;
   (void) fpair;
@@ -770,7 +778,7 @@ int GetTeamSize(FunctorStyle& KOKKOS_GPU_ARG(functor), int KOKKOS_GPU_ARG(inum),
 }
 
 // Submit ParallelFor for NEIGHFLAG=HALF,HALFTHREAD,FULL
-template<class PairStyle, unsigned NEIGHFLAG, class Specialisation>
+template<class PairStyle, unsigned NEIGHFLAG, int ZEROFLAG = 0, class Specialisation = void>
 EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos<typename PairStyle::device_type>*> list) {
   EV_FLOAT ev;
 
@@ -784,13 +792,13 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P
     int atoms_per_team = 32;
 
     if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
-      PairComputeFunctor<PairStyle,NEIGHFLAG,false,Specialisation > ff(fpair,list);
+      PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
       atoms_per_team = GetTeamSize<typename PairStyle::device_type>(ff, list->inum, (fpair->eflag || fpair->vflag), atoms_per_team, vector_length);
       Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(list->inum,atoms_per_team,vector_length);
       if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
       else                              Kokkos::parallel_for(policy,ff);
     } else {
-      PairComputeFunctor<PairStyle,NEIGHFLAG,true,Specialisation > ff(fpair,list);
+      PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
       atoms_per_team = GetTeamSize<typename PairStyle::device_type>(ff, list->inum, (fpair->eflag || fpair->vflag), atoms_per_team, vector_length);
       Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(list->inum,atoms_per_team,vector_length);
       if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
@@ -798,12 +806,12 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P
     }
   } else {
     if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
-      PairComputeFunctor<PairStyle,NEIGHFLAG,false,Specialisation > ff(fpair,list);
+      PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
       if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
       else                              Kokkos::parallel_for(list->inum,ff);
       ff.contribute();
     } else {
-      PairComputeFunctor<PairStyle,NEIGHFLAG,true,Specialisation > ff(fpair,list);
+      PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
       if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
       else                              Kokkos::parallel_for(list->inum,ff);
       ff.contribute();
@@ -812,16 +820,21 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P
   return ev;
 }
 
-template<class PairStyle, class Specialisation>
+template<class PairStyle, class Specialisation = void>
 EV_FLOAT pair_compute (PairStyle* fpair, NeighListKokkos<typename PairStyle::device_type>* list) {
   EV_FLOAT ev;
   if (fpair->neighflag == FULL) {
-    fpair->fuse_force_clear_flag = 1;
-    ev = pair_compute_neighlist<PairStyle,FULL,Specialisation> (fpair,list);
+    if (utils::strmatch(fpair->lmp->force->pair_style,"^hybrid/overlay")) {
+      fpair->fuse_force_clear_flag = 0;
+      ev = pair_compute_neighlist<PairStyle,FULL,0,Specialisation> (fpair,list);
+    } else {
+      fpair->fuse_force_clear_flag = 1;
+      ev = pair_compute_neighlist<PairStyle,FULL,1,Specialisation> (fpair,list);
+    }
   } else if (fpair->neighflag == HALFTHREAD) {
-    ev = pair_compute_neighlist<PairStyle,HALFTHREAD,Specialisation> (fpair,list);
+    ev = pair_compute_neighlist<PairStyle,HALFTHREAD,0,Specialisation> (fpair,list);
   } else if (fpair->neighflag == HALF) {
-    ev = pair_compute_neighlist<PairStyle,HALF,Specialisation> (fpair,list);
+    ev = pair_compute_neighlist<PairStyle,HALF,0,Specialisation> (fpair,list);
   }
   return ev;
 }
diff --git a/src/KOKKOS/pair_lj_charmm_coul_charmm_implicit_kokkos.h b/src/KOKKOS/pair_lj_charmm_coul_charmm_implicit_kokkos.h
index ae27ee68ab..7e21676fd5 100644
--- a/src/KOKKOS/pair_lj_charmm_coul_charmm_implicit_kokkos.h
+++ b/src/KOKKOS/pair_lj_charmm_coul_charmm_implicit_kokkos.h
@@ -110,27 +110,33 @@ class PairLJCharmmCoulCharmmImplicitKokkos : public PairLJCharmmCoulCharmmImplic
 
   void allocate() override;
 
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,FULL,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALF,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALFTHREAD,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,FULL,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALF,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALFTHREAD,false,CoulLongTable<1> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,FULL,CoulLongTable<1> >(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,HALF,CoulLongTable<1> >(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,HALFTHREAD,CoulLongTable<1> >(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJCharmmCoulCharmmImplicitKokkos,CoulLongTable<1> >(PairLJCharmmCoulCharmmImplicitKokkos*,
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,FULL,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,FULL,true,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALF,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALFTHREAD,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,FULL,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,FULL,false,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALF,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALFTHREAD,false,0,CoulLongTable<1>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,FULL,0,CoulLongTable<1>>(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,FULL,1,CoulLongTable<1>>(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,HALF,0,CoulLongTable<1>>(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,HALFTHREAD,0,CoulLongTable<1>>(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJCharmmCoulCharmmImplicitKokkos,CoulLongTable<1>>(PairLJCharmmCoulCharmmImplicitKokkos*,
                                                             NeighListKokkos<DeviceType>*);
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,FULL,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALF,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALFTHREAD,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,FULL,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALF,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALFTHREAD,false,CoulLongTable<0> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,FULL,CoulLongTable<0> >(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,HALF,CoulLongTable<0> >(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,HALFTHREAD,CoulLongTable<0> >(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJCharmmCoulCharmmImplicitKokkos,CoulLongTable<0> >(PairLJCharmmCoulCharmmImplicitKokkos*,
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,FULL,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,FULL,true,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALF,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALFTHREAD,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,FULL,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,FULL,false,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALF,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmImplicitKokkos,HALFTHREAD,false,0,CoulLongTable<0>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,FULL,0,CoulLongTable<0>>(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,FULL,1,CoulLongTable<0>>(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,HALF,0,CoulLongTable<0>>(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmImplicitKokkos,HALFTHREAD,0,CoulLongTable<0>>(PairLJCharmmCoulCharmmImplicitKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJCharmmCoulCharmmImplicitKokkos,CoulLongTable<0>>(PairLJCharmmCoulCharmmImplicitKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJCharmmCoulCharmmImplicitKokkos>(PairLJCharmmCoulCharmmImplicitKokkos*);
 
diff --git a/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.h b/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.h
index 912ad573c6..1f26242ded 100644
--- a/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.h
+++ b/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.h
@@ -108,27 +108,33 @@ class PairLJCharmmCoulCharmmKokkos : public PairLJCharmmCoulCharmm {
 
   void allocate() override;
 
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,FULL,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALF,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALFTHREAD,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,FULL,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALF,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALFTHREAD,false,CoulLongTable<1> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,FULL,CoulLongTable<1> >(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,HALF,CoulLongTable<1> >(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,HALFTHREAD,CoulLongTable<1> >(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJCharmmCoulCharmmKokkos,CoulLongTable<1> >(PairLJCharmmCoulCharmmKokkos*,
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,FULL,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,FULL,true,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALF,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALFTHREAD,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,FULL,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,FULL,false,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALF,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALFTHREAD,false,0,CoulLongTable<1>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,FULL,0,CoulLongTable<1>>(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,FULL,1,CoulLongTable<1>>(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,HALF,0,CoulLongTable<1>>(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,HALFTHREAD,0,CoulLongTable<1>>(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJCharmmCoulCharmmKokkos,CoulLongTable<1>>(PairLJCharmmCoulCharmmKokkos*,
                                                             NeighListKokkos<DeviceType>*);
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,FULL,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALF,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALFTHREAD,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,FULL,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALF,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALFTHREAD,false,CoulLongTable<0> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,FULL,CoulLongTable<0> >(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,HALF,CoulLongTable<0> >(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,HALFTHREAD,CoulLongTable<0> >(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJCharmmCoulCharmmKokkos,CoulLongTable<0> >(PairLJCharmmCoulCharmmKokkos*,
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,FULL,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,FULL,true,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALF,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALFTHREAD,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,FULL,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,FULL,false,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALF,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulCharmmKokkos,HALFTHREAD,false,0,CoulLongTable<0>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,FULL,0,CoulLongTable<0>>(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,FULL,1,CoulLongTable<0>>(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,HALF,0,CoulLongTable<0>>(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulCharmmKokkos,HALFTHREAD,0,CoulLongTable<0>>(PairLJCharmmCoulCharmmKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJCharmmCoulCharmmKokkos,CoulLongTable<0>>(PairLJCharmmCoulCharmmKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJCharmmCoulCharmmKokkos>(PairLJCharmmCoulCharmmKokkos*);
 
diff --git a/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.h b/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.h
index 4ae8a12944..c6c80e76dc 100644
--- a/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.h
+++ b/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.h
@@ -106,27 +106,33 @@ class PairLJCharmmCoulLongKokkos : public PairLJCharmmCoulLong {
 
   void allocate() override;
 
-  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,FULL,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALF,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALFTHREAD,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,FULL,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALF,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALFTHREAD,false,CoulLongTable<1> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,FULL,CoulLongTable<1> >(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,HALF,CoulLongTable<1> >(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,HALFTHREAD,CoulLongTable<1> >(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJCharmmCoulLongKokkos,CoulLongTable<1> >(PairLJCharmmCoulLongKokkos*,
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,FULL,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,FULL,true,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALF,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,FULL,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,FULL,false,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALF,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<1>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,FULL,0,CoulLongTable<1>>(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,FULL,1,CoulLongTable<1>>(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,HALF,0,CoulLongTable<1>>(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,HALFTHREAD,0,CoulLongTable<1>>(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJCharmmCoulLongKokkos,CoulLongTable<1>>(PairLJCharmmCoulLongKokkos*,
                                                             NeighListKokkos<DeviceType>*);
-  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,FULL,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALF,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALFTHREAD,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,FULL,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALF,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALFTHREAD,false,CoulLongTable<0> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,FULL,CoulLongTable<0> >(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,HALF,CoulLongTable<0> >(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,HALFTHREAD,CoulLongTable<0> >(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJCharmmCoulLongKokkos,CoulLongTable<0> >(PairLJCharmmCoulLongKokkos*,
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,FULL,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,FULL,true,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALF,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,FULL,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,FULL,false,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALF,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmCoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<0>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,FULL,0,CoulLongTable<0>>(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,FULL,1,CoulLongTable<0>>(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,HALF,0,CoulLongTable<0>>(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmCoulLongKokkos,HALFTHREAD,0,CoulLongTable<0>>(PairLJCharmmCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJCharmmCoulLongKokkos,CoulLongTable<0>>(PairLJCharmmCoulLongKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJCharmmCoulLongKokkos>(PairLJCharmmCoulLongKokkos*);
 
diff --git a/src/KOKKOS/pair_lj_class2_coul_cut_kokkos.h b/src/KOKKOS/pair_lj_class2_coul_cut_kokkos.h
index 5ca276c28e..9399345458 100644
--- a/src/KOKKOS/pair_lj_class2_coul_cut_kokkos.h
+++ b/src/KOKKOS/pair_lj_class2_coul_cut_kokkos.h
@@ -104,15 +104,18 @@ class PairLJClass2CoulCutKokkos : public PairLJClass2CoulCut {
   double qqrd2e;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairLJClass2CoulCutKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairLJClass2CoulCutKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairLJClass2CoulCutKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairLJClass2CoulCutKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairLJClass2CoulCutKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairLJClass2CoulCutKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairLJClass2CoulCutKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairLJClass2CoulCutKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairLJClass2CoulCutKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairLJClass2CoulCutKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulCutKokkos,FULL,void>(PairLJClass2CoulCutKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulCutKokkos,HALF,void>(PairLJClass2CoulCutKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulCutKokkos,HALFTHREAD,void>(PairLJClass2CoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulCutKokkos,FULL,0>(PairLJClass2CoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulCutKokkos,FULL,1>(PairLJClass2CoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulCutKokkos,HALF>(PairLJClass2CoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulCutKokkos,HALFTHREAD>(PairLJClass2CoulCutKokkos*,NeighListKokkos<DeviceType>*);
   friend EV_FLOAT pair_compute<PairLJClass2CoulCutKokkos,void>(PairLJClass2CoulCutKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJClass2CoulCutKokkos>(PairLJClass2CoulCutKokkos*);
diff --git a/src/KOKKOS/pair_lj_class2_coul_long_kokkos.h b/src/KOKKOS/pair_lj_class2_coul_long_kokkos.h
index 599cc2a83c..1cf6590855 100644
--- a/src/KOKKOS/pair_lj_class2_coul_long_kokkos.h
+++ b/src/KOKKOS/pair_lj_class2_coul_long_kokkos.h
@@ -107,27 +107,33 @@ class PairLJClass2CoulLongKokkos : public PairLJClass2CoulLong {
   double qqrd2e;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,FULL,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALF,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALFTHREAD,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,FULL,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALF,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALFTHREAD,false,CoulLongTable<1> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,FULL,CoulLongTable<1> >(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,HALF,CoulLongTable<1> >(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,HALFTHREAD,CoulLongTable<1> >(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJClass2CoulLongKokkos,CoulLongTable<1> >(PairLJClass2CoulLongKokkos*,
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,FULL,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,FULL,true,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALF,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,FULL,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,FULL,false,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALF,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<1>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,FULL,0,CoulLongTable<1>>(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,FULL,1,CoulLongTable<1>>(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,HALF,0,CoulLongTable<1>>(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,HALFTHREAD,0,CoulLongTable<1>>(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJClass2CoulLongKokkos,CoulLongTable<1>>(PairLJClass2CoulLongKokkos*,
                                                             NeighListKokkos<DeviceType>*);
-  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,FULL,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALF,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALFTHREAD,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,FULL,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALF,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALFTHREAD,false,CoulLongTable<0> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,FULL,CoulLongTable<0> >(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,HALF,CoulLongTable<0> >(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,HALFTHREAD,CoulLongTable<0> >(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJClass2CoulLongKokkos,CoulLongTable<0> >(PairLJClass2CoulLongKokkos*,
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,FULL,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,FULL,true,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALF,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,FULL,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,FULL,false,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALF,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJClass2CoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<0>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,FULL,0,CoulLongTable<0>>(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,FULL,1,CoulLongTable<0>>(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,HALF,0,CoulLongTable<0>>(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2CoulLongKokkos,HALFTHREAD,0,CoulLongTable<0>>(PairLJClass2CoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJClass2CoulLongKokkos,CoulLongTable<0>>(PairLJClass2CoulLongKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJClass2CoulLongKokkos>(PairLJClass2CoulLongKokkos*);
 
diff --git a/src/KOKKOS/pair_lj_class2_kokkos.h b/src/KOKKOS/pair_lj_class2_kokkos.h
index 0936399ca8..5594680929 100644
--- a/src/KOKKOS/pair_lj_class2_kokkos.h
+++ b/src/KOKKOS/pair_lj_class2_kokkos.h
@@ -96,16 +96,19 @@ class PairLJClass2Kokkos : public PairLJClass2 {
   int nlocal,nall,eflag,vflag;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairLJClass2Kokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairLJClass2Kokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairLJClass2Kokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairLJClass2Kokkos,HALF,true>;
   friend struct PairComputeFunctor<PairLJClass2Kokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairLJClass2Kokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairLJClass2Kokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairLJClass2Kokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairLJClass2Kokkos,HALF,false>;
   friend struct PairComputeFunctor<PairLJClass2Kokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairLJClass2Kokkos,FULL,void>(PairLJClass2Kokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJClass2Kokkos,HALF,void>(PairLJClass2Kokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJClass2Kokkos,HALFTHREAD,void>(PairLJClass2Kokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJClass2Kokkos,void>(PairLJClass2Kokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2Kokkos,FULL,0>(PairLJClass2Kokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2Kokkos,FULL,1>(PairLJClass2Kokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2Kokkos,HALF>(PairLJClass2Kokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJClass2Kokkos,HALFTHREAD>(PairLJClass2Kokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJClass2Kokkos>(PairLJClass2Kokkos*,NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJClass2Kokkos>(PairLJClass2Kokkos*);
 };
 
diff --git a/src/KOKKOS/pair_lj_cut_coul_cut_kokkos.h b/src/KOKKOS/pair_lj_cut_coul_cut_kokkos.h
index 87464b37dc..affc67bf16 100644
--- a/src/KOKKOS/pair_lj_cut_coul_cut_kokkos.h
+++ b/src/KOKKOS/pair_lj_cut_coul_cut_kokkos.h
@@ -104,15 +104,18 @@ class PairLJCutCoulCutKokkos : public PairLJCutCoulCut {
   double qqrd2e;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairLJCutCoulCutKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairLJCutCoulCutKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairLJCutCoulCutKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairLJCutCoulCutKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairLJCutCoulCutKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairLJCutCoulCutKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairLJCutCoulCutKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairLJCutCoulCutKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairLJCutCoulCutKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairLJCutCoulCutKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulCutKokkos,FULL,void>(PairLJCutCoulCutKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulCutKokkos,HALF,void>(PairLJCutCoulCutKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulCutKokkos,HALFTHREAD,void>(PairLJCutCoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulCutKokkos,FULL,0>(PairLJCutCoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulCutKokkos,FULL,1>(PairLJCutCoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulCutKokkos,HALF>(PairLJCutCoulCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulCutKokkos,HALFTHREAD>(PairLJCutCoulCutKokkos*,NeighListKokkos<DeviceType>*);
   friend EV_FLOAT pair_compute<PairLJCutCoulCutKokkos,void>(PairLJCutCoulCutKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJCutCoulCutKokkos>(PairLJCutCoulCutKokkos*);
diff --git a/src/KOKKOS/pair_lj_cut_coul_debye_kokkos.h b/src/KOKKOS/pair_lj_cut_coul_debye_kokkos.h
index ea0b401959..eeed483b76 100644
--- a/src/KOKKOS/pair_lj_cut_coul_debye_kokkos.h
+++ b/src/KOKKOS/pair_lj_cut_coul_debye_kokkos.h
@@ -104,15 +104,18 @@ class PairLJCutCoulDebyeKokkos : public PairLJCutCoulDebye {
   double qqrd2e;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairLJCutCoulDebyeKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairLJCutCoulDebyeKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairLJCutCoulDebyeKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairLJCutCoulDebyeKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairLJCutCoulDebyeKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairLJCutCoulDebyeKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairLJCutCoulDebyeKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairLJCutCoulDebyeKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairLJCutCoulDebyeKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairLJCutCoulDebyeKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDebyeKokkos,FULL,void>(PairLJCutCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDebyeKokkos,HALF,void>(PairLJCutCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDebyeKokkos,HALFTHREAD,void>(PairLJCutCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDebyeKokkos,FULL,0>(PairLJCutCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDebyeKokkos,FULL,1>(PairLJCutCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDebyeKokkos,HALF>(PairLJCutCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDebyeKokkos,HALFTHREAD>(PairLJCutCoulDebyeKokkos*,NeighListKokkos<DeviceType>*);
   friend EV_FLOAT pair_compute<PairLJCutCoulDebyeKokkos,void>(PairLJCutCoulDebyeKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJCutCoulDebyeKokkos>(PairLJCutCoulDebyeKokkos*);
diff --git a/src/KOKKOS/pair_lj_cut_coul_dsf_kokkos.h b/src/KOKKOS/pair_lj_cut_coul_dsf_kokkos.h
index e420bd22a9..d9e5fcfe49 100644
--- a/src/KOKKOS/pair_lj_cut_coul_dsf_kokkos.h
+++ b/src/KOKKOS/pair_lj_cut_coul_dsf_kokkos.h
@@ -101,15 +101,18 @@ class PairLJCutCoulDSFKokkos : public PairLJCutCoulDSF {
   double qqrd2e;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairLJCutCoulDSFKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairLJCutCoulDSFKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairLJCutCoulDSFKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairLJCutCoulDSFKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairLJCutCoulDSFKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairLJCutCoulDSFKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairLJCutCoulDSFKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairLJCutCoulDSFKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairLJCutCoulDSFKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairLJCutCoulDSFKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDSFKokkos,FULL,void>(PairLJCutCoulDSFKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDSFKokkos,HALF,void>(PairLJCutCoulDSFKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDSFKokkos,HALFTHREAD,void>(PairLJCutCoulDSFKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDSFKokkos,FULL,0>(PairLJCutCoulDSFKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDSFKokkos,FULL,1>(PairLJCutCoulDSFKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDSFKokkos,HALF>(PairLJCutCoulDSFKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulDSFKokkos,HALFTHREAD>(PairLJCutCoulDSFKokkos*,NeighListKokkos<DeviceType>*);
   friend EV_FLOAT pair_compute<PairLJCutCoulDSFKokkos,void>(PairLJCutCoulDSFKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJCutCoulDSFKokkos>(PairLJCutCoulDSFKokkos*);
diff --git a/src/KOKKOS/pair_lj_cut_coul_long_kokkos.h b/src/KOKKOS/pair_lj_cut_coul_long_kokkos.h
index bcb97a59cd..ec6e2db176 100644
--- a/src/KOKKOS/pair_lj_cut_coul_long_kokkos.h
+++ b/src/KOKKOS/pair_lj_cut_coul_long_kokkos.h
@@ -107,27 +107,33 @@ class PairLJCutCoulLongKokkos : public PairLJCutCoulLong {
   double qqrd2e;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,FULL,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALF,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALFTHREAD,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,FULL,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALF,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALFTHREAD,false,CoulLongTable<1> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,FULL,CoulLongTable<1> >(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,HALF,CoulLongTable<1> >(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,HALFTHREAD,CoulLongTable<1> >(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJCutCoulLongKokkos,CoulLongTable<1> >(PairLJCutCoulLongKokkos*,
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,FULL,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,FULL,true,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALF,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,FULL,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,FULL,false,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALF,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<1>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,FULL,0,CoulLongTable<1>>(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,FULL,1,CoulLongTable<1>>(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,HALF,0,CoulLongTable<1>>(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,HALFTHREAD,0,CoulLongTable<1>>(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJCutCoulLongKokkos,CoulLongTable<1>>(PairLJCutCoulLongKokkos*,
                                                             NeighListKokkos<DeviceType>*);
-  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,FULL,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALF,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALFTHREAD,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,FULL,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALF,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALFTHREAD,false,CoulLongTable<0> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,FULL,CoulLongTable<0> >(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,HALF,CoulLongTable<0> >(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,HALFTHREAD,CoulLongTable<0> >(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJCutCoulLongKokkos,CoulLongTable<0> >(PairLJCutCoulLongKokkos*,
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,FULL,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,FULL,true,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALF,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,FULL,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,FULL,false,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALF,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCutCoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<0>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,FULL,0,CoulLongTable<0>>(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,FULL,1,CoulLongTable<0>>(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,HALF,0,CoulLongTable<0>>(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutCoulLongKokkos,HALFTHREAD,0,CoulLongTable<0>>(PairLJCutCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJCutCoulLongKokkos,CoulLongTable<0>>(PairLJCutCoulLongKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJCutCoulLongKokkos>(PairLJCutCoulLongKokkos*);
 
diff --git a/src/KOKKOS/pair_lj_cut_kokkos.h b/src/KOKKOS/pair_lj_cut_kokkos.h
index 106f1a9048..b44c1aa6fe 100644
--- a/src/KOKKOS/pair_lj_cut_kokkos.h
+++ b/src/KOKKOS/pair_lj_cut_kokkos.h
@@ -92,16 +92,19 @@ class PairLJCutKokkos : public PairLJCut {
   int nlocal,nall,eflag,vflag;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairLJCutKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairLJCutKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairLJCutKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairLJCutKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairLJCutKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairLJCutKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairLJCutKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairLJCutKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairLJCutKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairLJCutKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutKokkos,FULL,void>(PairLJCutKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutKokkos,HALF,void>(PairLJCutKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJCutKokkos,HALFTHREAD,void>(PairLJCutKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJCutKokkos,void>(PairLJCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutKokkos,FULL,0>(PairLJCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutKokkos,FULL,1>(PairLJCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutKokkos,HALF>(PairLJCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCutKokkos,HALFTHREAD>(PairLJCutKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJCutKokkos>(PairLJCutKokkos*,NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJCutKokkos>(PairLJCutKokkos*);
 };
 
diff --git a/src/KOKKOS/pair_lj_expand_coul_long_kokkos.h b/src/KOKKOS/pair_lj_expand_coul_long_kokkos.h
index 09a694a122..30e82b7dab 100644
--- a/src/KOKKOS/pair_lj_expand_coul_long_kokkos.h
+++ b/src/KOKKOS/pair_lj_expand_coul_long_kokkos.h
@@ -116,27 +116,33 @@ class PairLJExpandCoulLongKokkos : public PairLJExpandCoulLong {
   double qqrd2e;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,FULL,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALF,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALFTHREAD,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,FULL,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALF,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALFTHREAD,false,CoulLongTable<1> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,FULL,CoulLongTable<1> >(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,HALF,CoulLongTable<1> >(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,HALFTHREAD,CoulLongTable<1> >(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJExpandCoulLongKokkos,CoulLongTable<1> >(PairLJExpandCoulLongKokkos*,
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,FULL,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,FULL,true,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALF,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,FULL,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,FULL,false,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALF,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<1>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,FULL,0,CoulLongTable<1>>(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,FULL,1,CoulLongTable<1>>(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,HALF,0,CoulLongTable<1>>(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,HALFTHREAD,0,CoulLongTable<1>>(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJExpandCoulLongKokkos,CoulLongTable<1>>(PairLJExpandCoulLongKokkos*,
                                                             NeighListKokkos<DeviceType>*);
-  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,FULL,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALF,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALFTHREAD,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,FULL,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALF,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALFTHREAD,false,CoulLongTable<0> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,FULL,CoulLongTable<0> >(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,HALF,CoulLongTable<0> >(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,HALFTHREAD,CoulLongTable<0> >(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJExpandCoulLongKokkos,CoulLongTable<0> >(PairLJExpandCoulLongKokkos*,
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,FULL,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,FULL,true,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALF,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,FULL,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,FULL,false,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALF,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJExpandCoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<0>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,FULL,0,CoulLongTable<0>>(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,FULL,1,CoulLongTable<0>>(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,HALF,0,CoulLongTable<0>>(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJExpandCoulLongKokkos,HALFTHREAD,0,CoulLongTable<0>>(PairLJExpandCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJExpandCoulLongKokkos,CoulLongTable<0>>(PairLJExpandCoulLongKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJExpandCoulLongKokkos>(PairLJExpandCoulLongKokkos*);
 };
diff --git a/src/KOKKOS/pair_lj_expand_kokkos.h b/src/KOKKOS/pair_lj_expand_kokkos.h
index 0df0a6f8f8..64fe7d8b8e 100644
--- a/src/KOKKOS/pair_lj_expand_kokkos.h
+++ b/src/KOKKOS/pair_lj_expand_kokkos.h
@@ -97,16 +97,19 @@ class PairLJExpandKokkos : public PairLJExpand {
   int nlocal,nall,eflag,vflag;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairLJExpandKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairLJExpandKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairLJExpandKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairLJExpandKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairLJExpandKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairLJExpandKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairLJExpandKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairLJExpandKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairLJExpandKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairLJExpandKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairLJExpandKokkos,FULL,void>(PairLJExpandKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJExpandKokkos,HALF,void>(PairLJExpandKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJExpandKokkos,HALFTHREAD,void>(PairLJExpandKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJExpandKokkos,void>(PairLJExpandKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJExpandKokkos,FULL,0>(PairLJExpandKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJExpandKokkos,FULL,1>(PairLJExpandKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJExpandKokkos,HALF>(PairLJExpandKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJExpandKokkos,HALFTHREAD>(PairLJExpandKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJExpandKokkos>(PairLJExpandKokkos*,NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJExpandKokkos>(PairLJExpandKokkos*);
 };
 
diff --git a/src/KOKKOS/pair_lj_gromacs_coul_gromacs_kokkos.h b/src/KOKKOS/pair_lj_gromacs_coul_gromacs_kokkos.h
index 359c4a1229..020b621e33 100644
--- a/src/KOKKOS/pair_lj_gromacs_coul_gromacs_kokkos.h
+++ b/src/KOKKOS/pair_lj_gromacs_coul_gromacs_kokkos.h
@@ -115,27 +115,33 @@ class PairLJGromacsCoulGromacsKokkos : public PairLJGromacsCoulGromacs {
 
   void allocate() override;
 
-  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,FULL,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALF,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALFTHREAD,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,FULL,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALF,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALFTHREAD,false,CoulLongTable<1> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,FULL,CoulLongTable<1> >(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,HALF,CoulLongTable<1> >(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,HALFTHREAD,CoulLongTable<1> >(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJGromacsCoulGromacsKokkos,CoulLongTable<1> >(PairLJGromacsCoulGromacsKokkos*,
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,FULL,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,FULL,true,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALF,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALFTHREAD,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,FULL,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,FULL,false,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALF,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALFTHREAD,false,0,CoulLongTable<1>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,FULL,0,CoulLongTable<1>>(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,FULL,1,CoulLongTable<1>>(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,HALF,0,CoulLongTable<1>>(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,HALFTHREAD,0,CoulLongTable<1>>(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJGromacsCoulGromacsKokkos,CoulLongTable<1>>(PairLJGromacsCoulGromacsKokkos*,
                                                             NeighListKokkos<DeviceType>*);
-  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,FULL,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALF,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALFTHREAD,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,FULL,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALF,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALFTHREAD,false,CoulLongTable<0> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,FULL,CoulLongTable<0> >(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,HALF,CoulLongTable<0> >(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,HALFTHREAD,CoulLongTable<0> >(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJGromacsCoulGromacsKokkos,CoulLongTable<0> >(PairLJGromacsCoulGromacsKokkos*,
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,FULL,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,FULL,true,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALF,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALFTHREAD,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,FULL,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,FULL,false,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALF,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsCoulGromacsKokkos,HALFTHREAD,false,0,CoulLongTable<0>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,FULL,0,CoulLongTable<0>>(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,FULL,1,CoulLongTable<0>>(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,HALF,0,CoulLongTable<0>>(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsCoulGromacsKokkos,HALFTHREAD,0,CoulLongTable<0>>(PairLJGromacsCoulGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJGromacsCoulGromacsKokkos,CoulLongTable<0>>(PairLJGromacsCoulGromacsKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJGromacsCoulGromacsKokkos>(PairLJGromacsCoulGromacsKokkos*);
 
diff --git a/src/KOKKOS/pair_lj_gromacs_kokkos.h b/src/KOKKOS/pair_lj_gromacs_kokkos.h
index 95c600a415..ad41ca5120 100644
--- a/src/KOKKOS/pair_lj_gromacs_kokkos.h
+++ b/src/KOKKOS/pair_lj_gromacs_kokkos.h
@@ -115,27 +115,33 @@ class PairLJGromacsKokkos : public PairLJGromacs {
 
   void allocate() override;
 
-  friend struct PairComputeFunctor<PairLJGromacsKokkos,FULL,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALF,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALFTHREAD,true,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJGromacsKokkos,FULL,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALF,false,CoulLongTable<1> >;
-  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALFTHREAD,false,CoulLongTable<1> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,FULL,CoulLongTable<1> >(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,HALF,CoulLongTable<1> >(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,HALFTHREAD,CoulLongTable<1> >(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJGromacsKokkos,CoulLongTable<1> >(PairLJGromacsKokkos*,
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,FULL,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,FULL,true,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALF,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALFTHREAD,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,FULL,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,FULL,false,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALF,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALFTHREAD,false,0,CoulLongTable<1>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,FULL,0,CoulLongTable<1>>(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,FULL,1,CoulLongTable<1>>(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,HALF,0,CoulLongTable<1>>(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,HALFTHREAD,0,CoulLongTable<1>>(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJGromacsKokkos,CoulLongTable<1>>(PairLJGromacsKokkos*,
                                                             NeighListKokkos<DeviceType>*);
-  friend struct PairComputeFunctor<PairLJGromacsKokkos,FULL,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALF,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALFTHREAD,true,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJGromacsKokkos,FULL,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALF,false,CoulLongTable<0> >;
-  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALFTHREAD,false,CoulLongTable<0> >;
-  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,FULL,CoulLongTable<0> >(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,HALF,CoulLongTable<0> >(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,HALFTHREAD,CoulLongTable<0> >(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJGromacsKokkos,CoulLongTable<0> >(PairLJGromacsKokkos*,
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,FULL,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,FULL,true,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALF,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALFTHREAD,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,FULL,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,FULL,false,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALF,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJGromacsKokkos,HALFTHREAD,false,0,CoulLongTable<0>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,FULL,0,CoulLongTable<0>>(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,FULL,1,CoulLongTable<0>>(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,HALF,0,CoulLongTable<0>>(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJGromacsKokkos,HALFTHREAD,0,CoulLongTable<0>>(PairLJGromacsKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJGromacsKokkos,CoulLongTable<0>>(PairLJGromacsKokkos*,
                                                             NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJGromacsKokkos>(PairLJGromacsKokkos*);
 
diff --git a/src/KOKKOS/pair_lj_spica_kokkos.h b/src/KOKKOS/pair_lj_spica_kokkos.h
index b330af4bfd..06c70ebd3e 100644
--- a/src/KOKKOS/pair_lj_spica_kokkos.h
+++ b/src/KOKKOS/pair_lj_spica_kokkos.h
@@ -97,16 +97,19 @@ class PairLJSPICAKokkos : public PairLJSPICA {
   int nlocal,nall,eflag,vflag;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairLJSPICAKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairLJSPICAKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairLJSPICAKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairLJSPICAKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairLJSPICAKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairLJSPICAKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairLJSPICAKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairLJSPICAKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairLJSPICAKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairLJSPICAKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairLJSPICAKokkos,FULL,void>(PairLJSPICAKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJSPICAKokkos,HALF,void>(PairLJSPICAKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairLJSPICAKokkos,HALFTHREAD,void>(PairLJSPICAKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairLJSPICAKokkos,void>(PairLJSPICAKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJSPICAKokkos,FULL,0>(PairLJSPICAKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJSPICAKokkos,FULL,1>(PairLJSPICAKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJSPICAKokkos,HALF>(PairLJSPICAKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJSPICAKokkos,HALFTHREAD>(PairLJSPICAKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJSPICAKokkos>(PairLJSPICAKokkos*,NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairLJSPICAKokkos>(PairLJSPICAKokkos*);
 };
 
diff --git a/src/KOKKOS/pair_morse_kokkos.h b/src/KOKKOS/pair_morse_kokkos.h
index d06cf2deb1..ccf27b018b 100644
--- a/src/KOKKOS/pair_morse_kokkos.h
+++ b/src/KOKKOS/pair_morse_kokkos.h
@@ -92,16 +92,19 @@ class PairMorseKokkos : public PairMorse {
   int nlocal,nall,eflag,vflag;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairMorseKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairMorseKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairMorseKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairMorseKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairMorseKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairMorseKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairMorseKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairMorseKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairMorseKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairMorseKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairMorseKokkos,FULL,void>(PairMorseKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairMorseKokkos,HALF,void>(PairMorseKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairMorseKokkos,HALFTHREAD,void>(PairMorseKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairMorseKokkos,void>(PairMorseKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairMorseKokkos,FULL,0>(PairMorseKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairMorseKokkos,FULL,1>(PairMorseKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairMorseKokkos,HALF>(PairMorseKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairMorseKokkos,HALFTHREAD>(PairMorseKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairMorseKokkos>(PairMorseKokkos*,NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairMorseKokkos>(PairMorseKokkos*);
 };
 
diff --git a/src/KOKKOS/pair_pace_kokkos.cpp b/src/KOKKOS/pair_pace_kokkos.cpp
index 56a6656d78..153a6d0333 100644
--- a/src/KOKKOS/pair_pace_kokkos.cpp
+++ b/src/KOKKOS/pair_pace_kokkos.cpp
@@ -237,6 +237,9 @@ void PairPACEKokkos<DeviceType>::copy_splines()
 
   ACERadialFunctions* radial_functions = dynamic_cast<ACERadialFunctions*>(basis_set->radial_functions);
 
+  if (radial_functions == nullptr)
+    error->all(FLERR,"Chosen radial basis style not supported by pair style pace/kk");
+
   for (int i = 0; i < nelements; i++) {
     for (int j = 0; j < nelements; j++) {
       k_splines_gk.h_view(i, j) = radial_functions->splines_gk(i, j);
diff --git a/src/KOKKOS/pair_table_kokkos.cpp b/src/KOKKOS/pair_table_kokkos.cpp
index 83bd74d4af..99d01be4a5 100644
--- a/src/KOKKOS/pair_table_kokkos.cpp
+++ b/src/KOKKOS/pair_table_kokkos.cpp
@@ -133,19 +133,19 @@ void PairTableKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
   EV_FLOAT ev;
   if (atom->ntypes > MAX_TYPES_STACKPARAMS) {
     if (neighflag == FULL) {
-      PairComputeFunctor<PairTableKokkos<DeviceType>,FULL,false,S_TableCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableKokkos<DeviceType>,FULL,false,0,S_TableCompute<DeviceType,TABSTYLE> >
         ff(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
       else Kokkos::parallel_for(list->inum,ff);
       ff.contribute();
     } else if (neighflag == HALFTHREAD) {
-      PairComputeFunctor<PairTableKokkos<DeviceType>,HALFTHREAD,false,S_TableCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableKokkos<DeviceType>,HALFTHREAD,false,0,S_TableCompute<DeviceType,TABSTYLE> >
         ff(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
       else Kokkos::parallel_for(list->inum,ff);
       ff.contribute();
     } else if (neighflag == HALF) {
-      PairComputeFunctor<PairTableKokkos<DeviceType>,HALF,false,S_TableCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableKokkos<DeviceType>,HALF,false,0,S_TableCompute<DeviceType,TABSTYLE> >
         f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
       else Kokkos::parallel_for(list->inum,f);
@@ -153,19 +153,19 @@ void PairTableKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
     }
   } else {
     if (neighflag == FULL) {
-      PairComputeFunctor<PairTableKokkos<DeviceType>,FULL,true,S_TableCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableKokkos<DeviceType>,FULL,true,0,S_TableCompute<DeviceType,TABSTYLE> >
         f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
       else Kokkos::parallel_for(list->inum,f);
       f.contribute();
     } else if (neighflag == HALFTHREAD) {
-      PairComputeFunctor<PairTableKokkos<DeviceType>,HALFTHREAD,true,S_TableCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableKokkos<DeviceType>,HALFTHREAD,true,0,S_TableCompute<DeviceType,TABSTYLE> >
         f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
       else Kokkos::parallel_for(list->inum,f);
       f.contribute();
     } else if (neighflag == HALF) {
-      PairComputeFunctor<PairTableKokkos<DeviceType>,HALF,true,S_TableCompute<DeviceType,TABSTYLE> >
+      PairComputeFunctor<PairTableKokkos<DeviceType>,HALF,true,0,S_TableCompute<DeviceType,TABSTYLE> >
         f(this,(NeighListKokkos<DeviceType>*) list);
       if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev);
       else Kokkos::parallel_for(list->inum,f);
diff --git a/src/KOKKOS/pair_table_kokkos.h b/src/KOKKOS/pair_table_kokkos.h
index 80226d3770..18112e4c18 100644
--- a/src/KOKKOS/pair_table_kokkos.h
+++ b/src/KOKKOS/pair_table_kokkos.h
@@ -35,9 +35,6 @@ struct S_TableCompute {
   static constexpr int TabStyle = TABSTYLE;
 };
 
-template <class DeviceType, int NEIGHFLAG, int TABSTYLE>
-struct PairTableComputeFunctor;
-
 template<class DeviceType>
 class PairTableKokkos : public PairTable {
  public:
@@ -135,33 +132,33 @@ class PairTableKokkos : public PairTable {
   F_FLOAT compute_ecoul(const F_FLOAT& /*rsq*/, const int& /*i*/, const int& /*j*/,
                         const int& /*itype*/, const int& /*jtype*/) const { return 0; }
 
-  friend struct PairComputeFunctor<PairTableKokkos,FULL,true,S_TableCompute<DeviceType,LOOKUP> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALF,true,S_TableCompute<DeviceType,LOOKUP> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,LOOKUP> >;
-  friend struct PairComputeFunctor<PairTableKokkos,FULL,false,S_TableCompute<DeviceType,LOOKUP> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALF,false,S_TableCompute<DeviceType,LOOKUP> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,LOOKUP> >;
+  friend struct PairComputeFunctor<PairTableKokkos,FULL,true,0,S_TableCompute<DeviceType,LOOKUP> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALF,true,0,S_TableCompute<DeviceType,LOOKUP> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,0,S_TableCompute<DeviceType,LOOKUP> >;
+  friend struct PairComputeFunctor<PairTableKokkos,FULL,false,0,S_TableCompute<DeviceType,LOOKUP> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALF,false,0,S_TableCompute<DeviceType,LOOKUP> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,0,S_TableCompute<DeviceType,LOOKUP> >;
 
-  friend struct PairComputeFunctor<PairTableKokkos,FULL,true,S_TableCompute<DeviceType,LINEAR> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALF,true,S_TableCompute<DeviceType,LINEAR> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,LINEAR> >;
-  friend struct PairComputeFunctor<PairTableKokkos,FULL,false,S_TableCompute<DeviceType,LINEAR> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALF,false,S_TableCompute<DeviceType,LINEAR> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,LINEAR> >;
+  friend struct PairComputeFunctor<PairTableKokkos,FULL,true,0,S_TableCompute<DeviceType,LINEAR> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALF,true,0,S_TableCompute<DeviceType,LINEAR> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,0,S_TableCompute<DeviceType,LINEAR> >;
+  friend struct PairComputeFunctor<PairTableKokkos,FULL,false,0,S_TableCompute<DeviceType,LINEAR> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALF,false,0,S_TableCompute<DeviceType,LINEAR> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,0,S_TableCompute<DeviceType,LINEAR> >;
 
-  friend struct PairComputeFunctor<PairTableKokkos,FULL,true,S_TableCompute<DeviceType,SPLINE> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALF,true,S_TableCompute<DeviceType,SPLINE> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,SPLINE> >;
-  friend struct PairComputeFunctor<PairTableKokkos,FULL,false,S_TableCompute<DeviceType,SPLINE> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALF,false,S_TableCompute<DeviceType,SPLINE> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,SPLINE> >;
+  friend struct PairComputeFunctor<PairTableKokkos,FULL,true,0,S_TableCompute<DeviceType,SPLINE> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALF,true,0,S_TableCompute<DeviceType,SPLINE> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,0,S_TableCompute<DeviceType,SPLINE> >;
+  friend struct PairComputeFunctor<PairTableKokkos,FULL,false,0,S_TableCompute<DeviceType,SPLINE> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALF,false,0,S_TableCompute<DeviceType,SPLINE> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,0,S_TableCompute<DeviceType,SPLINE> >;
 
-  friend struct PairComputeFunctor<PairTableKokkos,FULL,true,S_TableCompute<DeviceType,BITMAP> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALF,true,S_TableCompute<DeviceType,BITMAP> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,BITMAP> >;
-  friend struct PairComputeFunctor<PairTableKokkos,FULL,false,S_TableCompute<DeviceType,BITMAP> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALF,false,S_TableCompute<DeviceType,BITMAP> >;
-  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,BITMAP> >;
+  friend struct PairComputeFunctor<PairTableKokkos,FULL,true,0,S_TableCompute<DeviceType,BITMAP> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALF,true,0,S_TableCompute<DeviceType,BITMAP> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,0,S_TableCompute<DeviceType,BITMAP> >;
+  friend struct PairComputeFunctor<PairTableKokkos,FULL,false,0,S_TableCompute<DeviceType,BITMAP> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALF,false,0,S_TableCompute<DeviceType,BITMAP> >;
+  friend struct PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,0,S_TableCompute<DeviceType,BITMAP> >;
 
   friend void pair_virial_fdotr_compute<PairTableKokkos>(PairTableKokkos*);
 };
diff --git a/src/KOKKOS/pair_yukawa_colloid_kokkos.cpp b/src/KOKKOS/pair_yukawa_colloid_kokkos.cpp
new file mode 100644
index 0000000000..04eb5ab657
--- /dev/null
+++ b/src/KOKKOS/pair_yukawa_colloid_kokkos.cpp
@@ -0,0 +1,270 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Trung Nguyen (U Chicago)
+------------------------------------------------------------------------- */
+
+#include "pair_yukawa_colloid_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor.h"
+#include "respa.h"
+#include "update.h"
+
+#include <cmath>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairYukawaColloidKokkos<DeviceType>::PairYukawaColloidKokkos(LAMMPS *lmp) : PairYukawaColloid(lmp)
+{
+  // Kokkos bookkeeping: atom accessor, execution space, and the atom data masks
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+  datamask_read = datamask_modify | X_MASK | TYPE_MASK | RADIUS_MASK;
+  respa_enable = 0;  // rRESPA support is switched off for this style
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairYukawaColloidKokkos<DeviceType>::~PairYukawaColloidKokkos()
+{
+  // Skip cleanup when copymode is set or when nothing was allocated;
+  // otherwise release the Kokkos-managed per-atom and cutoff arrays.
+  if (copymode || !allocated) return;
+
+  memoryKK->destroy_kokkos(k_eatom,eatom);
+  memoryKK->destroy_kokkos(k_vatom,vatom);
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairYukawaColloidKokkos<DeviceType>::allocate()
+{
+  // let the base class allocate first, then swap cutsq for a Kokkos-managed array
+  PairYukawaColloid::allocate();
+  const int ntypes_p1 = atom->ntypes + 1;
+
+  memory->destroy(cutsq);
+  memoryKK->create_kokkos(k_cutsq,cutsq,ntypes_p1,ntypes_p1,"pair:cutsq");
+  d_cutsq = k_cutsq.template view<DeviceType>();
+
+  using params_dual_t = Kokkos::DualView<params_yukawa**,Kokkos::LayoutRight,DeviceType>;
+  k_params = params_dual_t("PairYukawaColloid::params",ntypes_p1,ntypes_p1);
+  params = k_params.template view<DeviceType>();
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairYukawaColloidKokkos<DeviceType>::init_style()
+{
+  PairYukawaColloid::init_style();
+
+  // rRESPA runs with an inner or middle level are rejected
+
+  const bool using_respa = update->whichflag == 1 &&
+    utils::strmatch(update->integrate_style,"^respa");
+  if (using_respa) {
+    const auto *respa_integrator = (Respa *) update->integrate;
+    if (respa_integrator->level_inner >= 0 || respa_integrator->level_middle >= 0)
+      error->all(FLERR,"Cannot use Kokkos pair style with rRESPA inner/middle");
+  }
+
+  // adapt the neighbor list request to the Kokkos space selected by DeviceType
+  constexpr bool on_device = std::is_same<DeviceType,LMPDeviceType>::value;
+  constexpr bool on_host = std::is_same<DeviceType,LMPHostType>::value && !on_device;
+  neighflag = lmp->kokkos->neighflag;
+  auto request = neighbor->find_request(this);
+  request->set_kokkos_host(on_host);
+  request->set_kokkos_device(on_device);
+  if (neighflag == FULL) request->enable_full();
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+// Rewrite this.
+template<class DeviceType>
+double PairYukawaColloidKokkos<DeviceType>::init_one(int i, int j)
+{
+  const double cutone = PairYukawaColloid::init_one(i,j);
+  const double cut_squared = cutone*cutone;
+  // host-side entry for (i,j); it is mirrored to (j,i) below
+  auto &entry = k_params.h_view(i,j);
+  entry.a      = a[i][j];
+  entry.offset = offset[i][j];
+  entry.cutsq  = cut_squared;
+  k_params.h_view(j,i) = entry;
+  // also keep a fixed-size copy when both types fit the stack-parameter tables
+  if (i <= MAX_TYPES_STACKPARAMS && j <= MAX_TYPES_STACKPARAMS) {
+    m_params[i][j] = m_params[j][i] = entry;
+    m_cutsq[i][j] = m_cutsq[j][i] = cut_squared;
+  }
+  k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cut_squared;
+  k_cutsq.template modify<LMPHostType>();
+  k_params.template modify<LMPHostType>();
+  return cutone;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairYukawaColloidKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  eflag = eflag_in;
+  vflag = vflag_in;
+  // Evaluate the pair interaction through the shared pair_compute driver and tally energy/virial.
+  // With a FULL neighbor list, flag that the virial must not be computed via fdotr.
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
+
+  ev_init(eflag,vflag,0);
+
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    memoryKK->destroy_kokkos(k_eatom,eatom);
+    memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memoryKK->destroy_kokkos(k_vatom,vatom);
+    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
+    d_vatom = k_vatom.view<DeviceType>();
+  }
+  // sync atom and coefficient data to this execution space; only forces are flagged modified without eflag/vflag
+  atomKK->sync(execution_space,datamask_read);
+  k_cutsq.template sync<DeviceType>();
+  k_params.template sync<DeviceType>();
+  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
+  else atomKK->modified(execution_space,F_MASK);
+  // refresh the cached atom-data views and the per-call scalars
+  x = atomKK->k_x.view<DeviceType>();
+  c_x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  radius = atomKK->k_radius.view<DeviceType>();
+  nlocal = atom->nlocal;
+  nall = atom->nlocal + atom->nghost;
+  newton_pair = force->newton_pair;
+  special_lj[0] = force->special_lj[0];
+  special_lj[1] = force->special_lj[1];
+  special_lj[2] = force->special_lj[2];
+  special_lj[3] = force->special_lj[3];
+
+  // loop over neighbors of my atoms
+
+  EV_FLOAT ev = pair_compute<PairYukawaColloidKokkos<DeviceType>,void >(
+    this,(NeighListKokkos<DeviceType>*)list);
+  // fold the returned energy/virial into the global accumulators when requested
+  if (eflag_global) eng_vdwl += ev.evdwl;
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+  // per-atom tallies are marked modified for DeviceType and synced to the host space
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<bool STACKPARAMS, class Specialisation>
+KOKKOS_INLINE_FUNCTION
+F_FLOAT PairYukawaColloidKokkos<DeviceType>::
+compute_fpair(const F_FLOAT& rsq, const int& i, const int&j,
+              const int& itype, const int& jtype) const {
+  // Returns the pair force divided by r for atoms i and j at squared distance rsq.
+  // i and j are needed here to look up the particle radii.
+  const F_FLOAT radi   = radius[i];
+  const F_FLOAT radj   = radius[j];
+  const F_FLOAT rr     = sqrt(rsq);
+  // prefactor comes from the fixed-size table (STACKPARAMS) or from the params view
+  const F_FLOAT aa     = STACKPARAMS ? m_params[itype][jtype].a
+                                     : params(itype,jtype).a;
+
+  // U   = a * exp(-kappa*(r-(radi+radj))) / kappa
+  // f   = -dU/dr = a * exp(-kappa*(r-(radi+radj)))
+  // f/r = a * exp(-kappa*(r-(radi+radj))) / r
+  const F_FLOAT rinv = 1.0 / rr;
+  const F_FLOAT screening = exp(-kappa*(rr-(radi+radj)));
+  const F_FLOAT forceyukawa = aa * screening;
+  const F_FLOAT fpair = forceyukawa * rinv;
+
+  return fpair;
+}
+
+template<class DeviceType>
+template<bool STACKPARAMS, class Specialisation>
+KOKKOS_INLINE_FUNCTION
+F_FLOAT PairYukawaColloidKokkos<DeviceType>::
+compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j,
+              const int& itype, const int& jtype) const {
+  // Returns the pair energy for atoms i and j at squared distance rsq,
+  // shifted by the stored per-type-pair offset. i and j index the radii.
+  const F_FLOAT radi   = radius[i];
+  const F_FLOAT radj   = radius[j];
+  const F_FLOAT rr     = sqrt(rsq);
+  const F_FLOAT aa     = STACKPARAMS ? m_params[itype][jtype].a
+                                     : params(itype,jtype).a;
+  const F_FLOAT offset = STACKPARAMS ? m_params[itype][jtype].offset
+                                     : params(itype,jtype).offset;
+
+  // U = a * exp(-kappa*(r-(radi+radj))) / kappa - offset
+  // (no 1/r factor enters the energy, so none is computed here)
+  const F_FLOAT screening = exp(-kappa*(rr-(radi+radj)));
+
+  return aa / kappa * screening - offset;
+}
+
+
+namespace LAMMPS_NS {
+template class PairYukawaColloidKokkos<LMPDeviceType>;  // explicit instantiation
+#ifdef LMP_KOKKOS_GPU
+template class PairYukawaColloidKokkos<LMPHostType>;  // separate host instantiation, only when LMP_KOKKOS_GPU is defined
+#endif // LMP_KOKKOS_GPU
+} // namespace LAMMPS_NS
diff --git a/src/KOKKOS/pair_yukawa_colloid_kokkos.h b/src/KOKKOS/pair_yukawa_colloid_kokkos.h
new file mode 100644
index 0000000000..83ce58e898
--- /dev/null
+++ b/src/KOKKOS/pair_yukawa_colloid_kokkos.h
@@ -0,0 +1,123 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(yukawa/colloid/kk,PairYukawaColloidKokkos<LMPDeviceType>);
+PairStyle(yukawa/colloid/kk/device,PairYukawaColloidKokkos<LMPDeviceType>);
+PairStyle(yukawa/colloid/kk/host,PairYukawaColloidKokkos<LMPHostType>);
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_PAIR_YUKAWA_COLLOID_KOKKOS_H
+#define LMP_PAIR_YUKAWA_COLLOID_KOKKOS_H
+
+#include "pair_kokkos.h"
+#include "pair_yukawa_colloid.h"
+#include "neigh_list_kokkos.h"
+
+namespace LAMMPS_NS {
+// Kokkos-enabled variant of PairYukawaColloid, templated on the Kokkos device type.
+template<class DeviceType>
+class PairYukawaColloidKokkos : public PairYukawaColloid {
+ public:
+  enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF};
+  enum {COUL_FLAG=0};
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  PairYukawaColloidKokkos(class LAMMPS *);
+  ~PairYukawaColloidKokkos() override;
+
+  void compute(int, int) override;
+  void init_style() override;
+  double init_one(int,int) override;
+  // per-type-pair coefficients; both constructors zero-initialize every field
+  struct params_yukawa {
+    KOKKOS_INLINE_FUNCTION
+    params_yukawa() : cutsq(0), a(0), offset(0) {}
+    KOKKOS_INLINE_FUNCTION
+    params_yukawa(int /*i*/) : cutsq(0), a(0), offset(0) {}
+    F_FLOAT cutsq, a, offset;
+  };
+
+
+ protected:
+  template<bool STACKPARAMS, class Specialisation>
+  KOKKOS_INLINE_FUNCTION
+  F_FLOAT compute_fpair(const F_FLOAT& rsq, const int& i, const int&j,
+                        const int& itype, const int& jtype) const;
+
+  template<bool STACKPARAMS, class Specialisation>
+  KOKKOS_INLINE_FUNCTION
+  F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j,
+                        const int& itype, const int& jtype) const;
+  // always returns 0 (COUL_FLAG is 0 for this style)
+  template<bool STACKPARAMS, class Specialisation>
+  KOKKOS_INLINE_FUNCTION
+  F_FLOAT compute_ecoul(const F_FLOAT& /*rsq*/, const int& /*i*/, const int& /*j*/,
+                        const int& /*itype*/, const int& /*jtype*/) const { return 0; }
+
+  // coefficient tables: a DualView plus fixed-size arrays of MAX_TYPES_STACKPARAMS+1 entries per dimension
+  Kokkos::DualView<params_yukawa**,Kokkos::LayoutRight,DeviceType> k_params;
+  typename Kokkos::DualView<params_yukawa**,Kokkos::LayoutRight,DeviceType>::t_dev_const_um params;
+  params_yukawa m_params[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+  F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+  typename AT::t_x_array_randomread x;
+  typename AT::t_x_array c_x;
+  typename AT::t_f_array f;
+  typename AT::t_int_1d_randomread type;
+  typename AT::t_float_1d_randomread radius;
+  // per-atom energy and virial arrays (dual views and their DeviceType views)
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
+
+  int newton_pair;
+  double special_lj[4];
+
+  typename AT::tdual_ffloat_2d k_cutsq;
+  typename AT::t_ffloat_2d d_cutsq;
+
+
+  int neighflag;
+  int nlocal,nall,eflag,vflag;
+
+  void allocate() override;
+  friend struct PairComputeFunctor<PairYukawaColloidKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairYukawaColloidKokkos,FULL,true,1>;
+  friend struct PairComputeFunctor<PairYukawaColloidKokkos,HALF,true>;
+  friend struct PairComputeFunctor<PairYukawaColloidKokkos,HALFTHREAD,true>;
+  friend struct PairComputeFunctor<PairYukawaColloidKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairYukawaColloidKokkos,FULL,false,1>;
+  friend struct PairComputeFunctor<PairYukawaColloidKokkos,HALF,false>;
+  friend struct PairComputeFunctor<PairYukawaColloidKokkos,HALFTHREAD,false>;
+  friend EV_FLOAT pair_compute_neighlist<PairYukawaColloidKokkos,FULL,0>(PairYukawaColloidKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairYukawaColloidKokkos,FULL,1>(PairYukawaColloidKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairYukawaColloidKokkos,HALF>(
+    PairYukawaColloidKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairYukawaColloidKokkos,HALFTHREAD>(
+    PairYukawaColloidKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairYukawaColloidKokkos>(
+    PairYukawaColloidKokkos*,NeighListKokkos<DeviceType>*);
+  friend void pair_virial_fdotr_compute<PairYukawaColloidKokkos>(PairYukawaColloidKokkos*);
+
+};
+
+}
+
+#endif
+#endif
+
diff --git a/src/KOKKOS/pair_yukawa_kokkos.h b/src/KOKKOS/pair_yukawa_kokkos.h
index e04f65264b..dc93e83aea 100644
--- a/src/KOKKOS/pair_yukawa_kokkos.h
+++ b/src/KOKKOS/pair_yukawa_kokkos.h
@@ -95,20 +95,19 @@ class PairYukawaKokkos : public PairYukawa {
   int nlocal,nall,eflag,vflag;
 
   void allocate() override;
-  friend struct PairComputeFunctor<PairYukawaKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairYukawaKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairYukawaKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairYukawaKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairYukawaKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairYukawaKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairYukawaKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairYukawaKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairYukawaKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairYukawaKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairYukawaKokkos,FULL,void>(
-    PairYukawaKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairYukawaKokkos,HALF,void>(
-    PairYukawaKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairYukawaKokkos,HALFTHREAD,void>(
-    PairYukawaKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairYukawaKokkos,void>(
-    PairYukawaKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairYukawaKokkos,FULL,0>(PairYukawaKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairYukawaKokkos,FULL,1>(PairYukawaKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairYukawaKokkos,HALF>(PairYukawaKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairYukawaKokkos,HALFTHREAD>(PairYukawaKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairYukawaKokkos,void>(PairYukawaKokkos*,NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairYukawaKokkos>(PairYukawaKokkos*);
 
 };
diff --git a/src/KOKKOS/pair_zbl_kokkos.h b/src/KOKKOS/pair_zbl_kokkos.h
index bd33cdb5e0..b7638a25e0 100644
--- a/src/KOKKOS/pair_zbl_kokkos.h
+++ b/src/KOKKOS/pair_zbl_kokkos.h
@@ -89,16 +89,19 @@ class PairZBLKokkos : public PairZBL {
 
   void allocate() override;
 
-  friend struct PairComputeFunctor<PairZBLKokkos,FULL,true>;
+  friend struct PairComputeFunctor<PairZBLKokkos,FULL,true,0>;
+  friend struct PairComputeFunctor<PairZBLKokkos,FULL,true,1>;
   friend struct PairComputeFunctor<PairZBLKokkos,HALF,true>;
   friend struct PairComputeFunctor<PairZBLKokkos,HALFTHREAD,true>;
-  friend struct PairComputeFunctor<PairZBLKokkos,FULL,false>;
+  friend struct PairComputeFunctor<PairZBLKokkos,FULL,false,0>;
+  friend struct PairComputeFunctor<PairZBLKokkos,FULL,false,1>;
   friend struct PairComputeFunctor<PairZBLKokkos,HALF,false>;
   friend struct PairComputeFunctor<PairZBLKokkos,HALFTHREAD,false>;
-  friend EV_FLOAT pair_compute_neighlist<PairZBLKokkos,FULL,void>(PairZBLKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairZBLKokkos,HALF,void>(PairZBLKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute_neighlist<PairZBLKokkos,HALFTHREAD,void>(PairZBLKokkos*,NeighListKokkos<DeviceType>*);
-  friend EV_FLOAT pair_compute<PairZBLKokkos,void>(PairZBLKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairZBLKokkos,FULL,0>(PairZBLKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairZBLKokkos,FULL,1>(PairZBLKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairZBLKokkos,HALF>(PairZBLKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairZBLKokkos,HALFTHREAD>(PairZBLKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairZBLKokkos>(PairZBLKokkos*,NeighListKokkos<DeviceType>*);
   friend void pair_virial_fdotr_compute<PairZBLKokkos>(PairZBLKokkos*);
 };
 
diff --git a/src/MANYBODY/pair_airebo.cpp b/src/MANYBODY/pair_airebo.cpp
index e34283f71c..129b9d2218 100644
--- a/src/MANYBODY/pair_airebo.cpp
+++ b/src/MANYBODY/pair_airebo.cpp
@@ -59,7 +59,6 @@ PairAIREBO::PairAIREBO(LAMMPS *lmp)
   nextra = 3;
   pvector = new double[nextra];
 
-  trim_flag = 0; // workaround
   maxlocal = 0;
   REBO_numneigh = nullptr;
   REBO_firstneigh = nullptr;
diff --git a/src/OPENMP/npair_half_bin_newton_tri_omp.cpp b/src/OPENMP/npair_half_bin_newton_tri_omp.cpp
index 4d93d06d75..47524474ed 100644
--- a/src/OPENMP/npair_half_bin_newton_tri_omp.cpp
+++ b/src/OPENMP/npair_half_bin_newton_tri_omp.cpp
@@ -12,16 +12,18 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
-#include "omp_compat.h"
 #include "npair_half_bin_newton_tri_omp.h"
 #include "npair_omp.h"
-#include "neigh_list.h"
+#include "omp_compat.h"
+
 #include "atom.h"
 #include "atom_vec.h"
-#include "molecule.h"
 #include "domain.h"
-#include "my_page.h"
 #include "error.h"
+#include "force.h"
+#include "molecule.h"
+#include "my_page.h"
+#include "neigh_list.h"
 
 using namespace LAMMPS_NS;
 
@@ -40,6 +42,7 @@ void NPairHalfBinNewtonTriOmp::build(NeighList *list)
   const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
   const int molecular = atom->molecular;
   const int moltemplate = (molecular == Atom::TEMPLATE) ? 1 : 0;
+  const double delta = 0.01 * force->angstrom;
 
   NPAIR_OMP_INIT;
 #if defined(_OPENMP)
@@ -48,12 +51,10 @@ void NPairHalfBinNewtonTriOmp::build(NeighList *list)
   NPAIR_OMP_SETUP(nlocal);
 
   int i,j,k,n,itype,jtype,ibin,which,imol,iatom;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   int *neighptr;
 
-  // loop over each atom, storing neighbors
-
   double **x = atom->x;
   int *type = atom->type;
   int *mask = atom->mask;
@@ -79,6 +80,7 @@ void NPairHalfBinNewtonTriOmp::build(NeighList *list)
     n = 0;
     neighptr = ipage.vget();
 
+    itag = tag[i];
     itype = type[i];
     xtmp = x[i][0];
     ytmp = x[i][1];
@@ -90,20 +92,31 @@ void NPairHalfBinNewtonTriOmp::build(NeighList *list)
     }
 
     // loop over all atoms in bins in stencil
-    // pairs for atoms j "below" i are excluded
-    // below = lower z or (equal z and lower y) or (equal zy and lower x)
-    //         (equal zyx and j <= i)
-    // latter excludes self-self interaction but allows superposed atoms
+    // for triclinic, bin stencil is full in all 3 dims
+    // must use itag/jtag to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     ibin = atom2bin[i];
     for (k = 0; k < nstencil; k++) {
       for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
-        if (x[j][2] < ztmp) continue;
-        if (x[j][2] == ztmp) {
-          if (x[j][1] < ytmp) continue;
-          if (x[j][1] == ytmp) {
-            if (x[j][0] < xtmp) continue;
-            if (x[j][0] == xtmp && j <= i) continue;
+
+        if (j <= i) continue;
+        if (j >= nlocal) {
+          jtag = tag[j];
+          if (itag > jtag) {
+            if ((itag+jtag) % 2 == 0) continue;
+          } else if (itag < jtag) {
+            if ((itag+jtag) % 2 == 1) continue;
+          } else {
+            if (fabs(x[j][2]-ztmp) > delta) {
+              if (x[j][2] < ztmp) continue;
+            } else if (fabs(x[j][1]-ytmp) > delta) {
+              if (x[j][1] < ytmp) continue;
+            } else {
+              if (x[j][0] < xtmp) continue;
+            }
           }
         }
 
diff --git a/src/OPENMP/npair_half_multi_newton_tri_omp.cpp b/src/OPENMP/npair_half_multi_newton_tri_omp.cpp
index a152d011a7..e26bea990f 100644
--- a/src/OPENMP/npair_half_multi_newton_tri_omp.cpp
+++ b/src/OPENMP/npair_half_multi_newton_tri_omp.cpp
@@ -12,17 +12,19 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
-#include "omp_compat.h"
 #include "npair_half_multi_newton_tri_omp.h"
-#include "npair_omp.h"
-#include "neighbor.h"
-#include "neigh_list.h"
+
 #include "atom.h"
 #include "atom_vec.h"
-#include "molecule.h"
 #include "domain.h"
-#include "my_page.h"
 #include "error.h"
+#include "force.h"
+#include "molecule.h"
+#include "my_page.h"
+#include "neigh_list.h"
+#include "neighbor.h"
+#include "npair_omp.h"
+#include "omp_compat.h"
 
 using namespace LAMMPS_NS;
 
@@ -43,6 +45,7 @@ void NPairHalfMultiNewtonTriOmp::build(NeighList *list)
   const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
   const int molecular = atom->molecular;
   const int moltemplate = (molecular == Atom::TEMPLATE) ? 1 : 0;
+  const double delta = 0.01 * force->angstrom;
 
   NPAIR_OMP_INIT;
 #if defined(_OPENMP)
@@ -51,13 +54,11 @@ void NPairHalfMultiNewtonTriOmp::build(NeighList *list)
   NPAIR_OMP_SETUP(nlocal);
 
   int i,j,k,n,itype,jtype,ibin,jbin,icollection,jcollection,which,ns,imol,iatom;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   int *neighptr,*s;
   int js;
 
-  // loop over each atom, storing neighbors
-
   int *collection = neighbor->collection;
   double **x = atom->x;
   int *type = atom->type;
@@ -84,6 +85,7 @@ void NPairHalfMultiNewtonTriOmp::build(NeighList *list)
     n = 0;
     neighptr = ipage.vget();
 
+    itag = tag[i];
     itype = type[i];
     icollection = collection[i];
     xtmp = x[i][0];
@@ -98,65 +100,80 @@ void NPairHalfMultiNewtonTriOmp::build(NeighList *list)
     ibin = atom2bin[i];
 
     // loop through stencils for all collections
+
     for (jcollection = 0; jcollection < ncollections; jcollection++) {
 
       // if same collection use own bin
+
       if (icollection == jcollection) jbin = ibin;
-          else jbin = coord2bin(x[i], jcollection);
+      else jbin = coord2bin(x[i], jcollection);
 
       // loop over all atoms in bins in stencil
-      // stencil is empty if i larger than j
-      // stencil is half if i same size as j
-      // stencil is full if i smaller than j
-      // if half: pairs for atoms j "below" i are excluded
-      // below = lower z or (equal z and lower y) or (equal zy and lower x)
-      //         (equal zyx and j <= i)
-      // latter excludes self-self interaction but allows superposed atoms
+      // for triclinic:
+      //   stencil is empty if i larger than j
+      //   stencil is full if i smaller than j
+      //   stencil is full if i same size as j
+      // for i same size as j:
+      //   must use itag/jtag to eliminate half the I/J interactions
+      //   cannot use I/J exact coord comparison
+      //     b/c transforming orthog -> lambda -> orthog for ghost atoms
+      //     with an added PBC offset can shift all 3 coords by epsilon
 
-          s = stencil_multi[icollection][jcollection];
-          ns = nstencil_multi[icollection][jcollection];
+      s = stencil_multi[icollection][jcollection];
+      ns = nstencil_multi[icollection][jcollection];
 
-          for (k = 0; k < ns; k++) {
-            js = binhead_multi[jcollection][jbin + s[k]];
-            for (j = js; j >= 0; j = bins[j]) {
+      for (k = 0; k < ns; k++) {
+        js = binhead_multi[jcollection][jbin + s[k]];
+        for (j = js; j >= 0; j = bins[j]) {
 
-          // if same size (same collection), use half stencil
-          if (cutcollectionsq[icollection][icollection] == cutcollectionsq[jcollection][jcollection]){
-            if (x[j][2] < ztmp) continue;
-            if (x[j][2] == ztmp) {
-              if (x[j][1] < ytmp) continue;
-              if (x[j][1] == ytmp) {
-                if (x[j][0] < xtmp) continue;
-                if (x[j][0] == xtmp && j <= i) continue;
+          // if same size (same collection), exclude half of interactions
+
+          if (cutcollectionsq[icollection][icollection] ==
+              cutcollectionsq[jcollection][jcollection]) {
+            if (j <= i) continue;
+            if (j >= nlocal) {
+              jtag = tag[j];
+              if (itag > jtag) {
+                if ((itag+jtag) % 2 == 0) continue;
+              } else if (itag < jtag) {
+                if ((itag+jtag) % 2 == 1) continue;
+              } else {
+                if (fabs(x[j][2]-ztmp) > delta) {
+                  if (x[j][2] < ztmp) continue;
+                } else if (fabs(x[j][1]-ytmp) > delta) {
+                  if (x[j][1] < ytmp) continue;
+                } else {
+                  if (x[j][0] < xtmp) continue;
+                }
               }
             }
           }
 
           jtype = type[j];
-              if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
 
-              delx = xtmp - x[j][0];
-              dely = ytmp - x[j][1];
-              delz = ztmp - x[j][2];
-              rsq = delx*delx + dely*dely + delz*delz;
+          delx = xtmp - x[j][0];
+          dely = ytmp - x[j][1];
+          delz = ztmp - x[j][2];
+          rsq = delx*delx + dely*dely + delz*delz;
 
-              if (rsq <= cutneighsq[itype][jtype]) {
-                if (molecular != Atom::ATOMIC) {
-                    if (!moltemplate)
-                      which = find_special(special[i],nspecial[i],tag[j]);
-                    else if (imol >= 0)
-                      which = find_special(onemols[imol]->special[iatom],
-                                       onemols[imol]->nspecial[iatom],
-                                       tag[j]-tagprev);
-                    else which = 0;
-                    if (which == 0) neighptr[n++] = j;
-                    else if (domain->minimum_image_check(delx,dely,delz))
-                      neighptr[n++] = j;
-                    else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
-                } else neighptr[n++] = j;
-              }
-            }
+          if (rsq <= cutneighsq[itype][jtype]) {
+            if (molecular != Atom::ATOMIC) {
+              if (!moltemplate)
+                which = find_special(special[i],nspecial[i],tag[j]);
+              else if (imol >= 0)
+                which = find_special(onemols[imol]->special[iatom],
+                                     onemols[imol]->nspecial[iatom],
+                                     tag[j]-tagprev);
+              else which = 0;
+              if (which == 0) neighptr[n++] = j;
+              else if (domain->minimum_image_check(delx,dely,delz))
+                neighptr[n++] = j;
+              else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+            } else neighptr[n++] = j;
           }
+        }
+      }
     }
 
     ilist[i] = i;
diff --git a/src/OPENMP/npair_half_multi_old_newton_tri_omp.cpp b/src/OPENMP/npair_half_multi_old_newton_tri_omp.cpp
index 1d906b1fa5..38f645abad 100644
--- a/src/OPENMP/npair_half_multi_old_newton_tri_omp.cpp
+++ b/src/OPENMP/npair_half_multi_old_newton_tri_omp.cpp
@@ -15,13 +15,15 @@
 #include "omp_compat.h"
 #include "npair_half_multi_old_newton_tri_omp.h"
 #include "npair_omp.h"
-#include "neigh_list.h"
+
 #include "atom.h"
 #include "atom_vec.h"
-#include "molecule.h"
 #include "domain.h"
-#include "my_page.h"
 #include "error.h"
+#include "force.h"
+#include "molecule.h"
+#include "my_page.h"
+#include "neigh_list.h"
 
 using namespace LAMMPS_NS;
 
@@ -42,6 +44,7 @@ void NPairHalfMultiOldNewtonTriOmp::build(NeighList *list)
   const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
   const int molecular = atom->molecular;
   const int moltemplate = (molecular == Atom::TEMPLATE) ? 1 : 0;
+  const double delta = 0.01 * force->angstrom;
 
   NPAIR_OMP_INIT;
 #if defined(_OPENMP)
@@ -50,13 +53,11 @@ void NPairHalfMultiOldNewtonTriOmp::build(NeighList *list)
   NPAIR_OMP_SETUP(nlocal);
 
   int i,j,k,n,itype,jtype,ibin,which,ns,imol,iatom;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   int *neighptr,*s;
   double *cutsq,*distsq;
 
-  // loop over each atom, storing neighbors
-
   double **x = atom->x;
   int *type = atom->type;
   int *mask = atom->mask;
@@ -82,6 +83,7 @@ void NPairHalfMultiOldNewtonTriOmp::build(NeighList *list)
     n = 0;
     neighptr = ipage.vget();
 
+    itag = tag[i];
     itype = type[i];
     xtmp = x[i][0];
     ytmp = x[i][1];
@@ -92,13 +94,12 @@ void NPairHalfMultiOldNewtonTriOmp::build(NeighList *list)
       tagprev = tag[i] - iatom - 1;
     }
 
-    // loop over all atoms in bins, including self, in stencil
-    // skip if i,j neighbor cutoff is less than bin distance
-    // bins below self are excluded from stencil
-    // pairs for atoms j "below" i are excluded
-    // below = lower z or (equal z and lower y) or (equal zy and lower x)
-    //         (equal zyx and j <= i)
-    // latter excludes self-self interaction but allows superposed atoms
+    // loop over all atoms in bins in stencil
+    // for triclinic, bin stencil is full in all 3 dims
+    // must use itag/jtag to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     ibin = atom2bin[i];
     s = stencil_multi_old[itype];
@@ -109,12 +110,24 @@ void NPairHalfMultiOldNewtonTriOmp::build(NeighList *list)
       for (j = binhead[ibin+s[k]]; j >= 0; j = bins[j]) {
         jtype = type[j];
         if (cutsq[jtype] < distsq[k]) continue;
-        if (x[j][2] < ztmp) continue;
-        if (x[j][2] == ztmp) {
-          if (x[j][1] < ytmp) continue;
-          if (x[j][1] == ytmp) {
-            if (x[j][0] < xtmp) continue;
-            if (x[j][0] == xtmp && j <= i) continue;
+
+        // bin stencil is full, so every owned pair is visited from both atoms:
+        //   keep it only when j > i, which also rejects the i == j self pair
+        if (j <= i) continue;
+        if (j >= nlocal) {
+          jtag = tag[j];
+          if (itag > jtag) {
+            if ((itag+jtag) % 2 == 0) continue;
+          } else if (itag < jtag) {
+            if ((itag+jtag) % 2 == 1) continue;
+          } else {
+            if (fabs(x[j][2]-ztmp) > delta) {
+              if (x[j][2] < ztmp) continue;
+            } else if (fabs(x[j][1]-ytmp) > delta) {
+              if (x[j][1] < ytmp) continue;
+            } else {
+              if (x[j][0] < xtmp) continue;
+            }
           }
         }
 
diff --git a/src/OPENMP/npair_half_nsq_newton_omp.cpp b/src/OPENMP/npair_half_nsq_newton_omp.cpp
index c010a3b024..42cf63278a 100644
--- a/src/OPENMP/npair_half_nsq_newton_omp.cpp
+++ b/src/OPENMP/npair_half_nsq_newton_omp.cpp
@@ -15,14 +15,16 @@
 #include "omp_compat.h"
 #include "npair_half_nsq_newton_omp.h"
 #include "npair_omp.h"
-#include "neigh_list.h"
+
 #include "atom.h"
 #include "atom_vec.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
 #include "group.h"
 #include "molecule.h"
-#include "domain.h"
 #include "my_page.h"
-#include "error.h"
+#include "neigh_list.h"
 
 using namespace LAMMPS_NS;
 
@@ -42,6 +44,8 @@ void NPairHalfNsqNewtonOmp::build(NeighList *list)
   const int bitmask = (includegroup) ? group->bitmask[includegroup] : 0;
   const int molecular = atom->molecular;
   const int moltemplate = (molecular == Atom::TEMPLATE) ? 1 : 0;
+  const double delta = 0.01 * force->angstrom;
+  const int triclinic = domain->triclinic;
 
   NPAIR_OMP_INIT;
 #if defined(_OPENMP)
@@ -54,8 +58,6 @@ void NPairHalfNsqNewtonOmp::build(NeighList *list)
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   int *neighptr;
 
-  // loop over each atom, storing neighbors
-
   double **x = atom->x;
   int *type = atom->type;
   int *mask = atom->mask;
@@ -95,7 +97,12 @@ void NPairHalfNsqNewtonOmp::build(NeighList *list)
     }
 
     // loop over remaining atoms, owned and ghost
+    // use itag/jtag comparison to eliminate half the interactions
     // itag = jtag is possible for long cutoffs that include images of self
+    // for triclinic, must use delta to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison as for orthog
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     for (j = i+1; j < nall; j++) {
       if (includegroup && !(mask[j] & bitmask)) continue;
@@ -106,6 +113,14 @@ void NPairHalfNsqNewtonOmp::build(NeighList *list)
           if ((itag+jtag) % 2 == 0) continue;
         } else if (itag < jtag) {
           if ((itag+jtag) % 2 == 1) continue;
+        } else if (triclinic) {
+          if (fabs(x[j][2]-ztmp) > delta) {
+            if (x[j][2] < ztmp) continue;
+          } else if (fabs(x[j][1]-ytmp) > delta) {
+            if (x[j][1] < ytmp) continue;
+          } else {
+            if (x[j][0] < xtmp) continue;
+          }
         } else {
           if (x[j][2] < ztmp) continue;
           if (x[j][2] == ztmp) {
diff --git a/src/OPENMP/npair_half_respa_bin_newton_tri_omp.cpp b/src/OPENMP/npair_half_respa_bin_newton_tri_omp.cpp
index 73f2102dba..78b3abdd66 100644
--- a/src/OPENMP/npair_half_respa_bin_newton_tri_omp.cpp
+++ b/src/OPENMP/npair_half_respa_bin_newton_tri_omp.cpp
@@ -15,13 +15,15 @@
 #include "omp_compat.h"
 #include "npair_half_respa_bin_newton_tri_omp.h"
 #include "npair_omp.h"
-#include "neigh_list.h"
+
 #include "atom.h"
 #include "atom_vec.h"
-#include "molecule.h"
 #include "domain.h"
-#include "my_page.h"
 #include "error.h"
+#include "force.h"
+#include "molecule.h"
+#include "my_page.h"
+#include "neigh_list.h"
 
 using namespace LAMMPS_NS;
 
@@ -42,6 +44,7 @@ void NPairHalfRespaBinNewtonTriOmp::build(NeighList *list)
   const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
   const int molecular = atom->molecular;
   const int moltemplate = (molecular == Atom::TEMPLATE) ? 1 : 0;
+  const double delta = 0.01 * force->angstrom;
 
   NPAIR_OMP_INIT;
 
@@ -53,12 +56,10 @@ void NPairHalfRespaBinNewtonTriOmp::build(NeighList *list)
   NPAIR_OMP_SETUP(nlocal);
 
   int i,j,k,n,itype,jtype,ibin,n_inner,n_middle,imol,iatom;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   int *neighptr,*neighptr_inner,*neighptr_middle;
 
-  // loop over each atom, storing neighbors
-
   double **x = atom->x;
   int *type = atom->type;
   int *mask = atom->mask;
@@ -111,6 +112,7 @@ void NPairHalfRespaBinNewtonTriOmp::build(NeighList *list)
       neighptr_middle = ipage_middle->vget();
     }
 
+    itag = tag[i];
     itype = type[i];
     xtmp = x[i][0];
     ytmp = x[i][1];
@@ -122,20 +124,31 @@ void NPairHalfRespaBinNewtonTriOmp::build(NeighList *list)
     }
 
     // loop over all atoms in bins in stencil
-    // pairs for atoms j "below" i are excluded
-    // below = lower z or (equal z and lower y) or (equal zy and lower x)
-    //         (equal zyx and j <= i)
-    // latter excludes self-self interaction but allows superposed atoms
+    // for triclinic, bin stencil is full in all 3 dims
+    // must use itag/jtag to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     ibin = atom2bin[i];
     for (k = 0; k < nstencil; k++) {
       for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
-        if (x[j][2] < ztmp) continue;
-        if (x[j][2] == ztmp) {
-          if (x[j][1] < ytmp) continue;
-          if (x[j][1] == ytmp) {
-            if (x[j][0] < xtmp) continue;
-            if (x[j][0] == xtmp && j <= i) continue;
+
+        if (j <= i) continue;
+        if (j >= nlocal) {
+          jtag = tag[j];
+          if (itag > jtag) {
+            if ((itag+jtag) % 2 == 0) continue;
+          } else if (itag < jtag) {
+            if ((itag+jtag) % 2 == 1) continue;
+          } else {
+            if (fabs(x[j][2]-ztmp) > delta) {
+              if (x[j][2] < ztmp) continue;
+            } else if (fabs(x[j][1]-ytmp) > delta) {
+              if (x[j][1] < ytmp) continue;
+            } else {
+              if (x[j][0] < xtmp) continue;
+            }
           }
         }
 
diff --git a/src/OPENMP/npair_half_respa_nsq_newton_omp.cpp b/src/OPENMP/npair_half_respa_nsq_newton_omp.cpp
index 4bcba0fbef..a9745edc64 100644
--- a/src/OPENMP/npair_half_respa_nsq_newton_omp.cpp
+++ b/src/OPENMP/npair_half_respa_nsq_newton_omp.cpp
@@ -15,21 +15,22 @@
 #include "omp_compat.h"
 #include "npair_half_respa_nsq_newton_omp.h"
 #include "npair_omp.h"
-#include "neigh_list.h"
+
 #include "atom.h"
 #include "atom_vec.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
 #include "group.h"
 #include "molecule.h"
-#include "domain.h"
 #include "my_page.h"
-#include "error.h"
+#include "neigh_list.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-NPairHalfRespaNsqNewtonOmp::NPairHalfRespaNsqNewtonOmp(LAMMPS *lmp) :
-  NPair(lmp) {}
+NPairHalfRespaNsqNewtonOmp::NPairHalfRespaNsqNewtonOmp(LAMMPS *lmp) : NPair(lmp) {}
 
 /* ----------------------------------------------------------------------
    multiple respa lists
@@ -45,6 +46,8 @@ void NPairHalfRespaNsqNewtonOmp::build(NeighList *list)
   const int bitmask = (includegroup) ? group->bitmask[includegroup] : 0;
   const int molecular = atom->molecular;
   const int moltemplate = (molecular == Atom::TEMPLATE) ? 1 : 0;
+  const double delta = 0.01 * force->angstrom;
+  const int triclinic = domain->triclinic;
 
   NPAIR_OMP_INIT;
 
@@ -60,8 +63,6 @@ void NPairHalfRespaNsqNewtonOmp::build(NeighList *list)
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   int *neighptr,*neighptr_inner,*neighptr_middle;
 
-  // loop over each atom, storing neighbors
-
   double **x = atom->x;
   int *type = atom->type;
   int *mask = atom->mask;
@@ -128,6 +129,12 @@ void NPairHalfRespaNsqNewtonOmp::build(NeighList *list)
     }
 
     // loop over remaining atoms, owned and ghost
+    // use itag/jtag comparison to eliminate half the interactions
+    // itag = jtag is possible for long cutoffs that include images of self
+    // for triclinic, must use delta to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison as for orthog
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     for (j = i+1; j < nall; j++) {
       if (includegroup && !(mask[j] & bitmask)) continue;
@@ -138,6 +145,14 @@ void NPairHalfRespaNsqNewtonOmp::build(NeighList *list)
           if ((itag+jtag) % 2 == 0) continue;
         } else if (itag < jtag) {
           if ((itag+jtag) % 2 == 1) continue;
+        } else if (triclinic) {
+          if (fabs(x[j][2]-ztmp) > delta) {
+            if (x[j][2] < ztmp) continue;
+          } else if (fabs(x[j][1]-ytmp) > delta) {
+            if (x[j][1] < ytmp) continue;
+          } else {
+            if (x[j][0] < xtmp) continue;
+          }
         } else {
           if (x[j][2] < ztmp) continue;
           if (x[j][2] == ztmp) {
diff --git a/src/OPENMP/npair_half_size_bin_newton_tri_omp.cpp b/src/OPENMP/npair_half_size_bin_newton_tri_omp.cpp
index 160cf64194..7fcf07e9c8 100644
--- a/src/OPENMP/npair_half_size_bin_newton_tri_omp.cpp
+++ b/src/OPENMP/npair_half_size_bin_newton_tri_omp.cpp
@@ -18,6 +18,7 @@
 #include "atom_vec.h"
 #include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "molecule.h"
 #include "my_page.h"
 #include "neigh_list.h"
@@ -46,6 +47,7 @@ void NPairHalfSizeBinNewtonTriOmp::build(NeighList *list)
   const int moltemplate = (molecular == Atom::TEMPLATE) ? 1 : 0;
   const int history = list->history;
   const int mask_history = 1 << HISTBITS;
+  const double delta = 0.01 * force->angstrom;
 
   NPAIR_OMP_INIT;
 #if defined(_OPENMP)
@@ -54,13 +56,11 @@ void NPairHalfSizeBinNewtonTriOmp::build(NeighList *list)
   NPAIR_OMP_SETUP(nlocal);
 
   int i,j,jh,k,n,ibin,which,imol,iatom;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   double radi,radsum,cutsq;
   int *neighptr;
 
-  // loop over each atom, storing neighbors
-
   double **x = atom->x;
   double *radius = atom->radius;
   int *type = atom->type;
@@ -87,6 +87,7 @@ void NPairHalfSizeBinNewtonTriOmp::build(NeighList *list)
     n = 0;
     neighptr = ipage.vget();
 
+    itag = tag[i];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
@@ -98,20 +99,31 @@ void NPairHalfSizeBinNewtonTriOmp::build(NeighList *list)
     }
 
     // loop over all atoms in bins in stencil
-    // pairs for atoms j "below" i are excluded
-    // below = lower z or (equal z and lower y) or (equal zy and lower x)
-    //         (equal zyx and j <= i)
-    // latter excludes self-self interaction but allows superposed atoms
+    // for triclinic, bin stencil is full in all 3 dims
+    // must use itag/jtag to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     ibin = atom2bin[i];
     for (k = 0; k < nstencil; k++) {
       for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
-        if (x[j][2] < ztmp) continue;
-        if (x[j][2] == ztmp) {
-          if (x[j][1] < ytmp) continue;
-          if (x[j][1] == ytmp) {
-            if (x[j][0] < xtmp) continue;
-            if (x[j][0] == xtmp && j <= i) continue;
+
+        if (j <= i) continue;
+        if (j >= nlocal) {
+          jtag = tag[j];
+          if (itag > jtag) {
+            if ((itag+jtag) % 2 == 0) continue;
+          } else if (itag < jtag) {
+            if ((itag+jtag) % 2 == 1) continue;
+          } else {
+            if (fabs(x[j][2]-ztmp) > delta) {
+              if (x[j][2] < ztmp) continue;
+            } else if (fabs(x[j][1]-ytmp) > delta) {
+              if (x[j][1] < ytmp) continue;
+            } else {
+              if (x[j][0] < xtmp) continue;
+            }
           }
         }
 
diff --git a/src/OPENMP/npair_half_size_multi_newton_tri_omp.cpp b/src/OPENMP/npair_half_size_multi_newton_tri_omp.cpp
index 73e11a2745..4765c918b7 100644
--- a/src/OPENMP/npair_half_size_multi_newton_tri_omp.cpp
+++ b/src/OPENMP/npair_half_size_multi_newton_tri_omp.cpp
@@ -18,6 +18,7 @@
 #include "atom_vec.h"
 #include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "molecule.h"
 #include "my_page.h"
 #include "neighbor.h"
@@ -48,6 +49,7 @@ void NPairHalfSizeMultiNewtonTriOmp::build(NeighList *list)
   const int moltemplate = (molecular == Atom::TEMPLATE) ? 1 : 0;
   const int history = list->history;
   const int mask_history = 1 << HISTBITS;
+  const double delta = 0.01 * force->angstrom;
 
   NPAIR_OMP_INIT;
 #if defined(_OPENMP)
@@ -55,15 +57,12 @@ void NPairHalfSizeMultiNewtonTriOmp::build(NeighList *list)
 #endif
   NPAIR_OMP_SETUP(nlocal);
 
-  int i,j,jh,k,n,itype,jtype,icollection,jcollection,ibin,jbin,ns;
+  int i,j,jh,k,n,itype,jtype,icollection,jcollection,ibin,jbin,ns,js;
   int which,imol,iatom;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   double radi,radsum,cutdistsq;
   int *neighptr,*s;
-  int js;
-
-  // loop over each atom, storing neighbors
 
   int *collection = neighbor->collection;
   double **x = atom->x;
@@ -92,6 +91,7 @@ void NPairHalfSizeMultiNewtonTriOmp::build(NeighList *list)
     n = 0;
     neighptr = ipage.vget();
 
+    itag = tag[i];
     itype = type[i];
     icollection = collection[i];
     xtmp = x[i][0];
@@ -107,12 +107,13 @@ void NPairHalfSizeMultiNewtonTriOmp::build(NeighList *list)
     ibin = atom2bin[i];
 
     // loop through stencils for all collections
+
     for (jcollection = 0; jcollection < ncollections; jcollection++) {
 
       // if same collection use own bin
-      if(icollection == jcollection) jbin = ibin;
-          else jbin = coord2bin(x[i], jcollection);
 
+      if (icollection == jcollection) jbin = ibin;
+      else jbin = coord2bin(x[i], jcollection);
 
       // loop over all atoms in bins in stencil
       // stencil is empty if i larger than j
@@ -130,14 +131,25 @@ void NPairHalfSizeMultiNewtonTriOmp::build(NeighList *list)
         js = binhead_multi[jcollection][jbin + s[k]];
         for (j = js; j >= 0; j = bins[j]) {
 
-          // if same size (same collection), use half stencil
-          if(cutcollectionsq[icollection][icollection] == cutcollectionsq[jcollection][jcollection]){
-            if (x[j][2] < ztmp) continue;
-            if (x[j][2] == ztmp) {
-              if (x[j][1] < ytmp) continue;
-              if (x[j][1] == ytmp) {
-                if (x[j][0] < xtmp) continue;
-                if (x[j][0] == xtmp && j <= i) continue;
+          // if same size (same collection), exclude half of interactions
+
+          if (cutcollectionsq[icollection][icollection] ==
+              cutcollectionsq[jcollection][jcollection]) {
+            if (j <= i) continue;
+            if (j >= nlocal) {
+              jtag = tag[j];
+              if (itag > jtag) {
+                if ((itag+jtag) % 2 == 0) continue;
+              } else if (itag < jtag) {
+                if ((itag+jtag) % 2 == 1) continue;
+              } else {
+                if (fabs(x[j][2]-ztmp) > delta) {
+                  if (x[j][2] < ztmp) continue;
+                } else if (fabs(x[j][1]-ytmp) > delta) {
+                  if (x[j][1] < ytmp) continue;
+                } else {
+                  if (x[j][0] < xtmp) continue;
+                }
               }
             }
           }
diff --git a/src/OPENMP/npair_half_size_multi_old_newton_tri_omp.cpp b/src/OPENMP/npair_half_size_multi_old_newton_tri_omp.cpp
index caa993ed38..7faa210107 100644
--- a/src/OPENMP/npair_half_size_multi_old_newton_tri_omp.cpp
+++ b/src/OPENMP/npair_half_size_multi_old_newton_tri_omp.cpp
@@ -18,6 +18,7 @@
 #include "atom_vec.h"
 #include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "molecule.h"
 #include "my_page.h"
 #include "neigh_list.h"
@@ -32,7 +33,6 @@ NPairHalfSizeMultiOldNewtonTriOmp::NPairHalfSizeMultiOldNewtonTriOmp(LAMMPS *lmp
   NPair(lmp) {}
 
 /* ----------------------------------------------------------------------
-   size particles
    binned neighbor list construction with Newton's 3rd law for triclinic
    each owned atom i checks its own bin and other bins in triclinic stencil
    multi-type stencil is itype dependent and is distance checked
@@ -46,6 +46,7 @@ void NPairHalfSizeMultiOldNewtonTriOmp::build(NeighList *list)
   const int moltemplate = (molecular == Atom::TEMPLATE) ? 1 : 0;
   const int history = list->history;
   const int mask_history = 1 << HISTBITS;
+  const double delta = 0.01 * force->angstrom;
 
   NPAIR_OMP_INIT;
 #if defined(_OPENMP)
@@ -54,7 +55,7 @@ void NPairHalfSizeMultiOldNewtonTriOmp::build(NeighList *list)
   NPAIR_OMP_SETUP(nlocal);
 
   int i,j,jh,k,n,itype,jtype,ibin,ns,which,imol,iatom;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   double radi,radsum,cutdistsq;
   int *neighptr,*s;
@@ -97,13 +98,12 @@ void NPairHalfSizeMultiOldNewtonTriOmp::build(NeighList *list)
       tagprev = tag[i] - iatom - 1;
     }
 
-    // loop over all atoms in bins, including self, in stencil
-    // skip if i,j neighbor cutoff is less than bin distance
-    // bins below self are excluded from stencil
-    // pairs for atoms j "below" i are excluded
-    // below = lower z or (equal z and lower y) or (equal zy and lower x)
-    //         (equal zyx and j <= i)
-    // latter excludes self-self interaction but allows superposed atoms
+    itag = tag[i];
+
+    // loop over all atoms in bins in stencil; for triclinic, bin stencil is full in all 3 dims
+    // must use itag/jtag to eliminate half the I/J interactions
+    // cannot use exact I/J coord comparison b/c transforming orthog -> lambda -> orthog
+    //   for ghost atoms with an added PBC offset can shift all 3 coords by epsilon
 
     ibin = atom2bin[i];
     s = stencil_multi_old[itype];
@@ -114,12 +114,22 @@ void NPairHalfSizeMultiOldNewtonTriOmp::build(NeighList *list)
       for (j = binhead[ibin+s[k]]; j >= 0; j = bins[j]) {
         jtype = type[j];
         if (cutsq[jtype] < distsq[k]) continue;
-        if (x[j][2] < ztmp) continue;
-        if (x[j][2] == ztmp) {
-          if (x[j][1] < ytmp) continue;
-          if (x[j][1] == ytmp) {
-            if (x[j][0] < xtmp) continue;
-            if (x[j][0] == xtmp && j <= i) continue;
+
+        if (j <= i) continue;
+        if (j >= nlocal) {
+          jtag = tag[j];
+          if (itag > jtag) {
+            if ((itag+jtag) % 2 == 0) continue;
+          } else if (itag < jtag) {
+            if ((itag+jtag) % 2 == 1) continue;
+          } else {
+            if (fabs(x[j][2]-ztmp) > delta) {
+              if (x[j][2] < ztmp) continue;
+            } else if (fabs(x[j][1]-ytmp) > delta) {
+              if (x[j][1] < ytmp) continue;
+            } else {
+              if (x[j][0] < xtmp) continue;
+            }
           }
         }
 
diff --git a/src/OPENMP/npair_half_size_nsq_newton_omp.cpp b/src/OPENMP/npair_half_size_nsq_newton_omp.cpp
index 0a80da9422..0628478c0b 100644
--- a/src/OPENMP/npair_half_size_nsq_newton_omp.cpp
+++ b/src/OPENMP/npair_half_size_nsq_newton_omp.cpp
@@ -18,6 +18,7 @@
 #include "atom_vec.h"
 #include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "molecule.h"
 #include "group.h"
 #include "my_page.h"
@@ -30,13 +31,11 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-NPairHalfSizeNsqNewtonOmp::NPairHalfSizeNsqNewtonOmp(LAMMPS *lmp) :
-  NPair(lmp) {}
+NPairHalfSizeNsqNewtonOmp::NPairHalfSizeNsqNewtonOmp(LAMMPS *lmp) : NPair(lmp) {}
 
 /* ----------------------------------------------------------------------
    size particles
    N^2 / 2 search for neighbor pairs with full Newton's 3rd law
-   shear history must be accounted for when a neighbor pair is added
    pair added to list if atoms i and j are both owned and i < j
    if j is ghost only me or other proc adds pair
    decision based on itag,jtag tests
@@ -50,6 +49,8 @@ void NPairHalfSizeNsqNewtonOmp::build(NeighList *list)
   const int moltemplate = (molecular == Atom::TEMPLATE) ? 1 : 0;
   const int history = list->history;
   const int mask_history = 1 << HISTBITS;
+  const double delta = 0.01 * force->angstrom;
+  const int triclinic = domain->triclinic;
 
   NPAIR_OMP_INIT;
 
@@ -104,6 +105,12 @@ void NPairHalfSizeNsqNewtonOmp::build(NeighList *list)
     }
 
     // loop over remaining atoms, owned and ghost
+    // use itag/jtag comparison to eliminate half the interactions
+    // itag = jtag is possible for long cutoffs that include images of self
+    // for triclinic, must use delta to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison as for orthog
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     for (j = i+1; j < nall; j++) {
       if (includegroup && !(mask[j] & bitmask)) continue;
@@ -114,6 +121,14 @@ void NPairHalfSizeNsqNewtonOmp::build(NeighList *list)
           if ((itag+jtag) % 2 == 0) continue;
         } else if (itag < jtag) {
           if ((itag+jtag) % 2 == 1) continue;
+        } else if (triclinic) {
+          if (fabs(x[j][2]-ztmp) > delta) {
+            if (x[j][2] < ztmp) continue;
+          } else if (fabs(x[j][1]-ytmp) > delta) {
+            if (x[j][1] < ytmp) continue;
+          } else {
+            if (x[j][0] < xtmp) continue;
+          }
         } else {
           if (x[j][2] < ztmp) continue;
           if (x[j][2] == ztmp) {
diff --git a/src/OPENMP/npair_halffull_newton_omp.cpp b/src/OPENMP/npair_halffull_newton_omp.cpp
index abd5f7eacb..e833ab3095 100644
--- a/src/OPENMP/npair_halffull_newton_omp.cpp
+++ b/src/OPENMP/npair_halffull_newton_omp.cpp
@@ -15,7 +15,9 @@
 #include "npair_halffull_newton_omp.h"
 
 #include "atom.h"
+#include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "my_page.h"
 #include "neigh_list.h"
 #include "npair_omp.h"
@@ -38,6 +40,8 @@ NPairHalffullNewtonOmp::NPairHalffullNewtonOmp(LAMMPS *lmp) : NPair(lmp) {}
 void NPairHalffullNewtonOmp::build(NeighList *list)
 {
   const int inum_full = list->listfull->inum;
+  const double delta = 0.01 * force->angstrom;
+  const int triclinic = domain->triclinic;
 
   NPAIR_OMP_INIT;
 #if defined(_OPENMP)
@@ -83,8 +87,17 @@ void NPairHalffullNewtonOmp::build(NeighList *list)
     for (jj = 0; jj < jnum; jj++) {
       joriginal = jlist[jj];
       j = joriginal & NEIGHMASK;
+
       if (j < nlocal) {
         if (i > j) continue;
+      } else if (triclinic) {
+        if (fabs(x[j][2]-ztmp) > delta) {
+          if (x[j][2] < ztmp) continue;
+        } else if (fabs(x[j][1]-ytmp) > delta) {
+          if (x[j][1] < ytmp) continue;
+        } else {
+          if (x[j][0] < xtmp) continue;
+        }
       } else {
         if (x[j][2] < ztmp) continue;
         if (x[j][2] == ztmp) {
diff --git a/src/OPENMP/npair_omp.h b/src/OPENMP/npair_omp.h
index 318fddfd54..7249c59406 100644
--- a/src/OPENMP/npair_omp.h
+++ b/src/OPENMP/npair_omp.h
@@ -32,6 +32,7 @@ namespace LAMMPS_NS {
 // get access to number of threads and per-thread data structures via FixOMP
 #define NPAIR_OMP_INIT                 \
   const int nthreads = comm->nthreads; \
+  omp_set_num_threads(nthreads); \
   const int ifix = modify->find_fix("package_omp")
 
 // get thread id and then assign each thread a fixed chunk of atoms
diff --git a/src/VORONOI/compute_voronoi_atom.cpp b/src/VORONOI/compute_voronoi_atom.cpp
index 28bab271a2..b4f1aa3055 100644
--- a/src/VORONOI/compute_voronoi_atom.cpp
+++ b/src/VORONOI/compute_voronoi_atom.cpp
@@ -111,12 +111,7 @@ ComputeVoronoi::ComputeVoronoi(LAMMPS *lmp, int narg, char **arg) :
       if (iarg + 2 > narg) error->all(FLERR,"Illegal compute voronoi/atom command");
       faces_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
       iarg += 2;
-    } else if (strcmp(arg[iarg], "peratom") == 0) {
-      if (iarg + 2 > narg) error->all(FLERR,"Illegal compute voronoi/atom command");
-      peratom_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
-      iarg += 2;
-    }
-    else error->all(FLERR,"Illegal compute voronoi/atom command");
+    } else error->all(FLERR,"Illegal compute voronoi/atom command");
   }
 
   if (occupation && ( surface!=VOROSURF_NONE || maxedge>0 ) )
@@ -394,27 +389,29 @@ void ComputeVoronoi::checkOccupation()
   // clear occupation vector
   memset(occvec, 0, oldnatoms*sizeof(*occvec));
 
-  int i, j, k,
-      nlocal = atom->nlocal,
-      nall = atom->nghost + nlocal;
-  double rx, ry, rz,
-         **x = atom->x;
+  int i, j, k;
+  double rx, ry, rz;
+
+  int nlocal = atom->nlocal;
+  int nall = atom->nghost + nlocal;
+  double **x = atom->x;
 
   // prepare destination buffer for variable evaluation
+
   if (atom->nmax > lmax) {
     memory->destroy(lnext);
     lmax = atom->nmax;
     memory->create(lnext,lmax,"voronoi/atom:lnext");
   }
 
-  // clear lroot
-  for (i=0; i<oldnall; ++i) lroot[i] = -1;
+  // clear lroot and lnext
 
-  // clear lnext
+  for (i=0; i<oldnall; ++i) lroot[i] = -1;
   for (i=0; i<nall; ++i) lnext[i] = -1;
 
   // loop over all local atoms and find out in which of the local first frame voronoi cells the are in
   // (need to loop over ghosts, too, to get correct occupation numbers for the second column)
+
   for (i=0; i<nall; ++i) {
     // again: find_voronoi_cell() should be in the common base class. Why it is not, I don't know. Ask the voro++ author.
     if ((  radstr && con_poly->find_voronoi_cell(x[i][0], x[i][1], x[i][2], rx, ry, rz, k)) ||
@@ -435,6 +432,7 @@ void ComputeVoronoi::checkOccupation()
   }
 
   // MPI sum occupation
+
 #ifdef NOTINPLACE
   memcpy(sendocc, occvec, oldnatoms*sizeof(*occvec));
   MPI_Allreduce(sendocc, occvec, oldnatoms, MPI_INT, MPI_SUM, world);
@@ -443,6 +441,7 @@ void ComputeVoronoi::checkOccupation()
 #endif
 
   // determine the total number of atoms in this atom's currently occupied cell
+
   int c;
   for (i=0; i<oldnall; i++) { // loop over lroot (old voronoi cells)
     // count
@@ -461,11 +460,12 @@ void ComputeVoronoi::checkOccupation()
   }
 
   // cherry pick currently owned atoms
+  // set the new atom count in the atom's first frame voronoi cell
+  // but take into account that new atoms might have been added to
+  // the system, so we can only look up occupancy for tags that are
+  // smaller or equal to the recorded largest tag.
+
   for (i=0; i<nlocal; i++) {
-    // set the new atom count in the atom's first frame voronoi cell
-    // but take into account that new atoms might have been added to
-    // the system, so we can only look up occupancy for tags that are
-    // smaller or equal to the recorded largest tag.
     tagint mytag = atom->tag[i];
     if (mytag > oldmaxtag)
       voro[i][0] = 0;
@@ -479,6 +479,7 @@ void ComputeVoronoi::checkOccupation()
 void ComputeVoronoi::loopCells()
 {
   // invoke voro++ and fetch results for owned atoms in group
+
   voronoicell_neighbor c;
   int i;
   if (faces_flag) nfaces = 0;
diff --git a/src/compute_msd_chunk.cpp b/src/compute_msd_chunk.cpp
index 07234ecfdb..6e7436d6ad 100644
--- a/src/compute_msd_chunk.cpp
+++ b/src/compute_msd_chunk.cpp
@@ -27,8 +27,8 @@ using namespace LAMMPS_NS;
 /* ---------------------------------------------------------------------- */
 
 ComputeMSDChunk::ComputeMSDChunk(LAMMPS *lmp, int narg, char **arg) :
-    ComputeChunk(lmp, narg, arg), id_fix(nullptr), massproc(nullptr), masstotal(nullptr),
-    com(nullptr), comall(nullptr), msd(nullptr)
+    ComputeChunk(lmp, narg, arg), id_fix(nullptr), fix(nullptr), massproc(nullptr),
+    masstotal(nullptr), com(nullptr), comall(nullptr), msd(nullptr)
 {
   if (narg != 4) error->all(FLERR, "Illegal compute msd/chunk command");
 
@@ -196,6 +196,12 @@ void ComputeMSDChunk::compute_array()
 void ComputeMSDChunk::allocate()
 {
   ComputeChunk::allocate();
+  memory->destroy(massproc);
+  memory->destroy(masstotal);
+  memory->destroy(com);
+  memory->destroy(comall);
+  memory->destroy(msd);
+
   memory->create(massproc, nchunk, "msd/chunk:massproc");
   memory->create(masstotal, nchunk, "msd/chunk:masstotal");
   memory->create(com, nchunk, 3, "msd/chunk:com");
diff --git a/src/compute_property_local.cpp b/src/compute_property_local.cpp
index d0523a1bec..87517a3e05 100644
--- a/src/compute_property_local.cpp
+++ b/src/compute_property_local.cpp
@@ -405,6 +405,7 @@ int ComputePropertyLocal::count_pairs(int allflag, int forceflag)
       if (!(mask[j] & groupbit)) continue;
 
       // itag = jtag is possible for long cutoffs that include images of self
+      // do not need triclinic logic here b/c neighbor list itself is correct
 
       if (newton_pair == 0 && j >= nlocal) {
         jtag = tag[j];
diff --git a/src/compute_reduce.cpp b/src/compute_reduce.cpp
index 6b27498eb7..3feabf2ec3 100644
--- a/src/compute_reduce.cpp
+++ b/src/compute_reduce.cpp
@@ -34,9 +34,11 @@ using namespace LAMMPS_NS;
 #define BIG 1.0e20
 
 //----------------------------------------------------------------
+
 void abs_max(void *in, void *inout, int * /*len*/, MPI_Datatype * /*type*/)
 {
   // r is the already reduced value, n is the new value
+
   double n = std::fabs(*(double *) in), r = *(double *) inout;
   double m;
 
@@ -47,9 +49,11 @@ void abs_max(void *in, void *inout, int * /*len*/, MPI_Datatype * /*type*/)
   }
   *(double *) inout = m;
 }
+
 void abs_min(void *in, void *inout, int * /*len*/, MPI_Datatype * /*type*/)
 {
   // r is the already reduced value, n is the new value
+
   double n = std::fabs(*(double *) in), r = *(double *) inout;
   double m;
 
@@ -68,6 +72,7 @@ ComputeReduce::ComputeReduce(LAMMPS *lmp, int narg, char **arg) :
     owner(nullptr), idregion(nullptr), region(nullptr), varatom(nullptr)
 {
   int iarg = 0;
+
   if (strcmp(style, "reduce") == 0) {
     if (narg < 5) utils::missing_cmd_args(FLERR, "compute reduce", error);
     iarg = 3;
@@ -134,7 +139,6 @@ ComputeReduce::ComputeReduce(LAMMPS *lmp, int narg, char **arg) :
     value_t val;
 
     val.id = "";
-    val.flavor = 0;
     val.val.c = nullptr;
 
     if (strcmp(arg[iarg], "x") == 0) {
@@ -188,6 +192,7 @@ ComputeReduce::ComputeReduce(LAMMPS *lmp, int narg, char **arg) :
   nvalues = values.size();
   replace = new int[nvalues];
   for (int i = 0; i < nvalues; ++i) replace[i] = -1;
+  input_mode = PERATOM;
   std::string mycmd = "compute ";
   mycmd += style;
 
@@ -207,6 +212,11 @@ ComputeReduce::ComputeReduce(LAMMPS *lmp, int narg, char **arg) :
         error->all(FLERR, "Compute {} replace column already used for another replacement");
       replace[col1] = col2;
       iarg += 2;
+    } else if (strcmp(arg[iarg], "inputs") == 0) {
+      if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, mycmd + " inputs", error);
+      if (strcmp(arg[iarg+1], "peratom") == 0) input_mode = PERATOM;
+      else if (strcmp(arg[iarg+1], "local") == 0) input_mode = LOCAL;
+      iarg += 2;
     } else
       error->all(FLERR, "Unknown compute {} keyword: {}", style, arg[iarg]);
   }
@@ -231,66 +241,67 @@ ComputeReduce::ComputeReduce(LAMMPS *lmp, int narg, char **arg) :
   // setup and error check
 
   for (auto &val : values) {
-    if (val.which == ArgInfo::X || val.which == ArgInfo::V || val.which == ArgInfo::F)
-      val.flavor = PERATOM;
+    if (val.which == ArgInfo::X || val.which == ArgInfo::V || val.which == ArgInfo::F) {
+      if (input_mode == LOCAL) error->all(FLERR,"Compute {} inputs must be all local", style);
 
-    else if (val.which == ArgInfo::COMPUTE) {
+    } else if (val.which == ArgInfo::COMPUTE) {
       val.val.c = modify->get_compute_by_id(val.id);
       if (!val.val.c)
         error->all(FLERR, "Compute ID {} for compute {} does not exist", val.id, style);
-      if (val.val.c->peratom_flag) {
-        val.flavor = PERATOM;
+
+      if (input_mode == PERATOM) {
+        if (!val.val.c->peratom_flag)
+          error->all(FLERR, "Compute {} compute {} does not calculate per-atom values", style, val.id);
         if (val.argindex == 0 && val.val.c->size_peratom_cols != 0)
-          error->all(FLERR, "Compute {} compute {} does not calculate a per-atom vector", style,
-                     val.id);
+          error->all(FLERR, "Compute {} compute {} does not calculate a per-atom vector", style, val.id);
         if (val.argindex && val.val.c->size_peratom_cols == 0)
-          error->all(FLERR, "Compute {} compute {} does not calculate a per-atom array", style,
-                     val.id);
+          error->all(FLERR, "Compute {} compute {} does not calculate a per-atom array", style, val.id);
         if (val.argindex && val.argindex > val.val.c->size_peratom_cols)
           error->all(FLERR, "Compute {} compute {} array is accessed out-of-range", style, val.id);
-      } else if (val.val.c->local_flag) {
-        val.flavor = LOCAL;
+
+      } else if (input_mode == LOCAL) {
+        if (!val.val.c->local_flag)
+          error->all(FLERR, "Compute {} compute {} does not calculate local values", style, val.id);
         if (val.argindex == 0 && val.val.c->size_local_cols != 0)
-          error->all(FLERR, "Compute {} compute {} does not calculate a local vector", style,
-                     val.id);
+          error->all(FLERR, "Compute {} compute {} does not calculate a local vector", style, val.id);
         if (val.argindex && val.val.c->size_local_cols == 0)
-          error->all(FLERR, "Compute {} compute {} does not calculate a local array", style,
-                     val.id);
+          error->all(FLERR, "Compute {} compute {} does not calculate a local array", style, val.id);
         if (val.argindex && val.argindex > val.val.c->size_local_cols)
           error->all(FLERR, "Compute {} compute {} array is accessed out-of-range", style, val.id);
-      } else
-        error->all(FLERR, "Compute {} compute {} calculates global values", style, val.id);
+      }
 
     } else if (val.which == ArgInfo::FIX) {
       val.val.f = modify->get_fix_by_id(val.id);
       if (!val.val.f) error->all(FLERR, "Fix ID {} for compute {} does not exist", val.id, style);
-      if (val.val.f->peratom_flag) {
-        val.flavor = PERATOM;
+
+      if (input_mode == PERATOM) {
+        if (!val.val.f->peratom_flag)
+          error->all(FLERR, "Compute {} fix {} does not calculate per-atom values", style, val.id);
         if (val.argindex == 0 && (val.val.f->size_peratom_cols != 0))
-          error->all(FLERR, "Compute {} fix {} does not calculate a per-atom vector", style,
-                     val.id);
+          error->all(FLERR, "Compute {} fix {} does not calculate a per-atom vector", style, val.id);
         if (val.argindex && (val.val.f->size_peratom_cols == 0))
           error->all(FLERR, "Compute {} fix {} does not calculate a per-atom array", style, val.id);
         if (val.argindex && (val.argindex > val.val.f->size_peratom_cols))
           error->all(FLERR, "Compute {} fix {} array is accessed out-of-range", style, val.id);
-      } else if (val.val.f->local_flag) {
-        val.flavor = LOCAL;
+
+      } else if (input_mode == LOCAL) {
+        if (!val.val.f->local_flag)
+          error->all(FLERR, "Compute {} fix {} does not calculate local values", style, val.id);
         if (val.argindex == 0 && (val.val.f->size_local_cols != 0))
           error->all(FLERR, "Compute {} fix {} does not calculate a local vector", style, val.id);
         if (val.argindex && (val.val.f->size_local_cols == 0))
           error->all(FLERR, "Compute {} fix {} does not calculate a local array", style, val.id);
         if (val.argindex && (val.argindex > val.val.f->size_local_cols))
           error->all(FLERR, "Compute {} fix {} array is accessed out-of-range", style, val.id);
-      } else
-        error->all(FLERR, "Compute {} fix {} calculates global values", style, val.id);
+      }
 
     } else if (val.which == ArgInfo::VARIABLE) {
+      if (input_mode == LOCAL) error->all(FLERR,"Compute {} inputs must be all local", style);
       val.val.v = input->variable->find(val.id.c_str());
       if (val.val.v < 0)
         error->all(FLERR, "Variable name {} for compute {} does not exist", val.id, style);
       if (input->variable->atomstyle(val.val.v) == 0)
         error->all(FLERR, "Compute {} variable {} is not atom-style variable", style, val.id);
-      val.flavor = PERATOM;
     }
   }
 
@@ -512,7 +523,7 @@ double ComputeReduce::compute_one(int m, int flag)
 
   } else if (val.which == ArgInfo::COMPUTE) {
 
-    if (val.flavor == PERATOM) {
+    if (input_mode == PERATOM) {
       if (!(val.val.c->invoked_flag & Compute::INVOKED_PERATOM)) {
         val.val.c->compute_peratom();
         val.val.c->invoked_flag |= Compute::INVOKED_PERATOM;
@@ -537,7 +548,7 @@ double ComputeReduce::compute_one(int m, int flag)
           one = carray_atom[flag][aidxm1];
       }
 
-    } else if (val.flavor == LOCAL) {
+    } else if (input_mode == LOCAL) {
       if (!(val.val.c->invoked_flag & Compute::INVOKED_LOCAL)) {
         val.val.c->compute_local();
         val.val.c->invoked_flag |= Compute::INVOKED_LOCAL;
@@ -567,7 +578,7 @@ double ComputeReduce::compute_one(int m, int flag)
     if (update->ntimestep % val.val.f->peratom_freq)
       error->all(FLERR, "Fix {} used in compute {} not computed at compatible time", val.id, style);
 
-    if (val.flavor == PERATOM) {
+    if (input_mode == PERATOM) {
       if (aidx == 0) {
         double *fix_vector = val.val.f->vector_atom;
         if (flag < 0) {
@@ -585,7 +596,7 @@ double ComputeReduce::compute_one(int m, int flag)
           one = fix_array[flag][aidxm1];
       }
 
-    } else if (val.flavor == LOCAL) {
+    } else if (input_mode == LOCAL) {
       if (aidx == 0) {
         double *fix_vector = val.val.f->vector_local;
         int n = val.val.f->size_local_rows;
@@ -632,18 +643,18 @@ bigint ComputeReduce::count(int m)
   if ((val.which == ArgInfo::X) || (val.which == ArgInfo::V) || (val.which == ArgInfo::F))
     return group->count(igroup);
   else if (val.which == ArgInfo::COMPUTE) {
-    if (val.flavor == PERATOM) {
+    if (input_mode == PERATOM) {
       return group->count(igroup);
-    } else if (val.flavor == LOCAL) {
+    } else if (input_mode == LOCAL) {
       bigint ncount = val.val.c->size_local_rows;
       bigint ncountall;
       MPI_Allreduce(&ncount, &ncountall, 1, MPI_LMP_BIGINT, MPI_SUM, world);
       return ncountall;
     }
   } else if (val.which == ArgInfo::FIX) {
-    if (val.flavor == PERATOM) {
+    if (input_mode == PERATOM) {
       return group->count(igroup);
-    } else if (val.flavor == LOCAL) {
+    } else if (input_mode == LOCAL) {
       bigint ncount = val.val.f->size_local_rows;
       bigint ncountall;
       MPI_Allreduce(&ncount, &ncountall, 1, MPI_LMP_BIGINT, MPI_SUM, world);
diff --git a/src/compute_reduce.h b/src/compute_reduce.h
index f8f73cb17a..f8b652e00c 100644
--- a/src/compute_reduce.h
+++ b/src/compute_reduce.h
@@ -37,12 +37,11 @@ class ComputeReduce : public Compute {
   double memory_usage() override;
 
  protected:
-  int mode, nvalues;
+  int mode, nvalues, input_mode;
   struct value_t {
     int which;
     int argindex;
     std::string id;
-    int flavor;
     union {
       class Compute *c;
       class Fix *f;
diff --git a/src/compute_reduce_region.cpp b/src/compute_reduce_region.cpp
index efce00ff66..bd850e902c 100644
--- a/src/compute_reduce_region.cpp
+++ b/src/compute_reduce_region.cpp
@@ -33,13 +33,15 @@ static constexpr double BIG = 1.0e20;
 ComputeReduceRegion::ComputeReduceRegion(LAMMPS *lmp, int narg, char **arg) :
     ComputeReduce(lmp, narg, arg)
 {
+  if (input_mode == LOCAL)
+    error->all(FLERR,"Compute reduce/region cannot use local data as input");
 }
 
 /* ----------------------------------------------------------------------
    calculate reduced value for one input M and return it
    if flag = -1:
      sum/min/max/ave all values in vector
-     for per-atom quantities, limit to atoms in group and region
+     limit to atoms in group and region
      if mode = MIN or MAX, also set index to which vector value wins
    if flag >= 0: simply return vector[flag]
 ------------------------------------------------------------------------- */
@@ -57,6 +59,7 @@ double ComputeReduceRegion::compute_one(int m, int flag)
 
   // initialization in case it has not yet been run, e.g. when
   // the compute was invoked right after it has been created
+
   if ((val.which == ArgInfo::COMPUTE) || (val.which == ArgInfo::FIX)) {
     if (val.val.c == nullptr) init();
   }
@@ -97,52 +100,29 @@ double ComputeReduceRegion::compute_one(int m, int flag)
     // invoke compute if not previously invoked
 
   } else if (val.which == ArgInfo::COMPUTE) {
-    if (val.flavor == PERATOM) {
-      if (!(val.val.c->invoked_flag & Compute::INVOKED_PERATOM)) {
-        val.val.c->compute_peratom();
-        val.val.c->invoked_flag |= Compute::INVOKED_PERATOM;
-      }
 
-      if (aidx == 0) {
-        double *compute_vector = val.val.c->vector_atom;
-        if (flag < 0) {
-          for (int i = 0; i < nlocal; i++)
-            if (mask[i] & groupbit && region->match(x[i][0], x[i][1], x[i][2]))
-              combine(one, compute_vector[i], i);
-        } else
-          one = compute_vector[flag];
-      } else {
-        double **compute_array = val.val.c->array_atom;
-        int aidxm1 = aidx - 1;
-        if (flag < 0) {
-          for (int i = 0; i < nlocal; i++)
-            if (mask[i] & groupbit && region->match(x[i][0], x[i][1], x[i][2]))
-              combine(one, compute_array[i][aidxm1], i);
-        } else
-          one = compute_array[flag][aidxm1];
-      }
+    if (!(val.val.c->invoked_flag & Compute::INVOKED_PERATOM)) {
+      val.val.c->compute_peratom();
+      val.val.c->invoked_flag |= Compute::INVOKED_PERATOM;
+    }
 
-    } else if (val.flavor == LOCAL) {
-      if (!(val.val.c->invoked_flag & Compute::INVOKED_LOCAL)) {
-        val.val.c->compute_local();
-        val.val.c->invoked_flag |= Compute::INVOKED_LOCAL;
-      }
-
-      if (aidx == 0) {
-        double *compute_vector = val.val.c->vector_local;
-        if (flag < 0)
-          for (int i = 0; i < val.val.c->size_local_rows; i++) combine(one, compute_vector[i], i);
-        else
-          one = compute_vector[flag];
-      } else {
-        double **compute_array = val.val.c->array_local;
-        int aidxm1 = aidx - 1;
-        if (flag < 0)
-          for (int i = 0; i < val.val.c->size_local_rows; i++)
+    if (aidx == 0) {
+      double *compute_vector = val.val.c->vector_atom;
+      if (flag < 0) {
+        for (int i = 0; i < nlocal; i++)
+          if (mask[i] & groupbit && region->match(x[i][0], x[i][1], x[i][2]))
+            combine(one, compute_vector[i], i);
+      } else
+        one = compute_vector[flag];
+    } else {
+      double **compute_array = val.val.c->array_atom;
+      int aidxm1 = aidx - 1;
+      if (flag < 0) {
+        for (int i = 0; i < nlocal; i++)
+          if (mask[i] & groupbit && region->match(x[i][0], x[i][1], x[i][2]))
             combine(one, compute_array[i][aidxm1], i);
-        else
-          one = compute_array[flag][aidxm1];
-      }
+      } else
+        one = compute_array[flag][aidxm1];
     }
 
     // check if fix frequency is a match
@@ -151,45 +131,26 @@ double ComputeReduceRegion::compute_one(int m, int flag)
     if (update->ntimestep % val.val.f->peratom_freq)
       error->all(FLERR, "Fix {} used in compute {} not computed at compatible time", val.id, style);
 
-    if (val.flavor == PERATOM) {
-      if (aidx == 0) {
-        double *fix_vector = val.val.f->vector_atom;
-        if (flag < 0) {
-          for (int i = 0; i < nlocal; i++)
-            if (mask[i] & groupbit && region->match(x[i][0], x[i][1], x[i][2]))
-              combine(one, fix_vector[i], i);
-        } else
-          one = fix_vector[flag];
-      } else {
-        double **fix_array = val.val.f->array_atom;
-        int aidxm1 = aidx - 1;
-        if (flag < 0) {
-          for (int i = 0; i < nlocal; i++)
-            if (mask[i] & groupbit && region->match(x[i][0], x[i][1], x[i][2]))
-              combine(one, fix_array[i][aidxm1], i);
-        } else
-          one = fix_array[flag][aidxm1];
-      }
-
-    } else if (val.flavor == LOCAL) {
-      if (aidx == 0) {
-        double *fix_vector = val.val.f->vector_local;
-        if (flag < 0)
-          for (int i = 0; i < val.val.f->size_local_rows; i++) combine(one, fix_vector[i], i);
-        else
-          one = fix_vector[flag];
-      } else {
-        double **fix_array = val.val.f->array_local;
-        int aidxm1 = aidx - 1;
-        if (flag < 0)
-          for (int i = 0; i < val.val.f->size_local_rows; i++)
+    if (aidx == 0) {
+      double *fix_vector = val.val.f->vector_atom;
+      if (flag < 0) {
+        for (int i = 0; i < nlocal; i++)
+          if (mask[i] & groupbit && region->match(x[i][0], x[i][1], x[i][2]))
+            combine(one, fix_vector[i], i);
+      } else
+        one = fix_vector[flag];
+    } else {
+      double **fix_array = val.val.f->array_atom;
+      int aidxm1 = aidx - 1;
+      if (flag < 0) {
+        for (int i = 0; i < nlocal; i++)
+          if (mask[i] & groupbit && region->match(x[i][0], x[i][1], x[i][2]))
             combine(one, fix_array[i][aidxm1], i);
-        else
-          one = fix_array[flag][aidxm1];
-      }
+      } else
+        one = fix_array[flag][aidxm1];
     }
 
-    // evaluate atom-style variable
+  // evaluate atom-style variable
 
   } else if (val.which == ArgInfo::VARIABLE) {
     if (atom->nmax > maxatom) {
@@ -218,25 +179,11 @@ bigint ComputeReduceRegion::count(int m)
 
   if (val.which == ArgInfo::X || val.which == ArgInfo::V || val.which == ArgInfo::F)
     return group->count(igroup, region);
-  else if (val.which == ArgInfo::COMPUTE) {
-    if (val.flavor == PERATOM) {
-      return group->count(igroup, region);
-    } else if (val.flavor == LOCAL) {
-      bigint ncount = val.val.c->size_local_rows;
-      bigint ncountall;
-      MPI_Allreduce(&ncount, &ncountall, 1, MPI_DOUBLE, MPI_SUM, world);
-      return ncountall;
-    }
-  } else if (val.which == ArgInfo::FIX) {
-    if (val.flavor == PERATOM) {
-      return group->count(igroup, region);
-    } else if (val.flavor == LOCAL) {
-      bigint ncount = val.val.f->size_local_rows;
-      bigint ncountall;
-      MPI_Allreduce(&ncount, &ncountall, 1, MPI_DOUBLE, MPI_SUM, world);
-      return ncountall;
-    }
-  } else if (val.which == ArgInfo::VARIABLE)
+  else if (val.which == ArgInfo::COMPUTE)
+    return group->count(igroup, region);
+  else if (val.which == ArgInfo::FIX)
+    return group->count(igroup, region);
+  else if (val.which == ArgInfo::VARIABLE)
     return group->count(igroup, region);
 
   bigint dummy = 0;
diff --git a/src/fix_ave_histo.cpp b/src/fix_ave_histo.cpp
index 0a2975bb2e..4503ad56f4 100644
--- a/src/fix_ave_histo.cpp
+++ b/src/fix_ave_histo.cpp
@@ -164,7 +164,7 @@ FixAveHisto::FixAveHisto(LAMMPS *lmp, int narg, char **arg) :
   }
 
   // check input args for kind consistency
-  // all inputs must all be global, per-atom, or local
+  // inputs must all be either global, per-atom, or local
 
   if (nevery <= 0)
     error->all(FLERR,"Illegal {} nevery value: {}", mycmd, nevery);
diff --git a/src/fix_efield.cpp b/src/fix_efield.cpp
index 9132904b80..236395093c 100644
--- a/src/fix_efield.cpp
+++ b/src/fix_efield.cpp
@@ -129,6 +129,8 @@ FixEfield::FixEfield(LAMMPS *lmp, int narg, char **arg) :
 
 FixEfield::~FixEfield()
 {
+  if (copymode) return;
+
   delete[] xstr;
   delete[] ystr;
   delete[] zstr;
diff --git a/src/fix_property_atom.cpp b/src/fix_property_atom.cpp
index 994b4f0f19..3a53110839 100644
--- a/src/fix_property_atom.cpp
+++ b/src/fix_property_atom.cpp
@@ -198,16 +198,24 @@ FixPropertyAtom::FixPropertyAtom(LAMMPS *lmp, int narg, char **arg) :
 
   astyle = utils::strdup(atom->atom_style);
 
-  // perform initial allocation of atom-based array
   // register with Atom class
 
-  nmax_old = 0;
-  if (!lmp->kokkos) FixPropertyAtom::grow_arrays(atom->nmax);
   atom->add_callback(Atom::GROW);
   atom->add_callback(Atom::RESTART);
   if (border) atom->add_callback(Atom::BORDER);
 }
 
+
+/* ---------------------------------------------------------------------- */
+
+void FixPropertyAtom::post_constructor()
+{
+  // perform initial allocation of atom-based array
+
+  nmax_old = 0;
+  grow_arrays(atom->nmax);
+}
+
 /* ---------------------------------------------------------------------- */
 
 FixPropertyAtom::~FixPropertyAtom()
diff --git a/src/fix_property_atom.h b/src/fix_property_atom.h
index 92497d6188..c50b6049dc 100644
--- a/src/fix_property_atom.h
+++ b/src/fix_property_atom.h
@@ -27,6 +27,7 @@ namespace LAMMPS_NS {
 class FixPropertyAtom : public Fix {
  public:
   FixPropertyAtom(class LAMMPS *, int, char **);
+  void post_constructor() override;
   ~FixPropertyAtom() override;
   int setmask() override;
   void init() override;
diff --git a/src/fix_spring_self.cpp b/src/fix_spring_self.cpp
index 550b3afc4d..df00a2ba8c 100644
--- a/src/fix_spring_self.cpp
+++ b/src/fix_spring_self.cpp
@@ -96,6 +96,8 @@ FixSpringSelf::FixSpringSelf(LAMMPS *lmp, int narg, char **arg) :
 
 FixSpringSelf::~FixSpringSelf()
 {
+  if (copymode) return;
+
   // unregister callbacks to this fix from Atom class
 
   atom->delete_callback(id,Atom::GROW);
diff --git a/src/fix_spring_self.h b/src/fix_spring_self.h
index 59dba78e43..f13f2be918 100644
--- a/src/fix_spring_self.h
+++ b/src/fix_spring_self.h
@@ -47,7 +47,7 @@ class FixSpringSelf : public Fix {
   int size_restart(int) override;
   int maxsize_restart() override;
 
- private:
+ protected:
   double k, espring;
   double **xoriginal;    // original coords of atoms
   int xflag, yflag, zflag;
diff --git a/src/neighbor.cpp b/src/neighbor.cpp
index df1547e5eb..c6eea7e2f1 100644
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@@ -313,7 +313,10 @@ void Neighbor::init()
   triclinic = domain->triclinic;
   newton_pair = force->newton_pair;
 
-  // error check
+  // error checks
+
+  if (triclinic && atom->tag_enable == 0)
+    error->all(FLERR, "Cannot build triclinic neighbor lists unless atoms have IDs");
 
   if (delay > 0 && (delay % every) != 0)
     error->all(FLERR,"Neighbor delay must be 0 or multiple of every setting");
diff --git a/src/npair_half_bin_newton_tri.cpp b/src/npair_half_bin_newton_tri.cpp
index 88ef993a41..d261363b0e 100644
--- a/src/npair_half_bin_newton_tri.cpp
+++ b/src/npair_half_bin_newton_tri.cpp
@@ -13,13 +13,15 @@
 ------------------------------------------------------------------------- */
 
 #include "npair_half_bin_newton_tri.h"
-#include "neigh_list.h"
+
 #include "atom.h"
 #include "atom_vec.h"
-#include "molecule.h"
 #include "domain.h"
-#include "my_page.h"
 #include "error.h"
+#include "force.h"
+#include "molecule.h"
+#include "my_page.h"
+#include "neigh_list.h"
 
 using namespace LAMMPS_NS;
 
@@ -36,10 +38,12 @@ NPairHalfBinNewtonTri::NPairHalfBinNewtonTri(LAMMPS *lmp) : NPair(lmp) {}
 void NPairHalfBinNewtonTri::build(NeighList *list)
 {
   int i,j,k,n,itype,jtype,ibin,which,imol,iatom,moltemplate;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   int *neighptr;
 
+  const double delta = 0.01 * force->angstrom;
+
   double **x = atom->x;
   int *type = atom->type;
   int *mask = atom->mask;
@@ -68,6 +72,7 @@ void NPairHalfBinNewtonTri::build(NeighList *list)
     n = 0;
     neighptr = ipage->vget();
 
+    itag = tag[i];
     itype = type[i];
     xtmp = x[i][0];
     ytmp = x[i][1];
@@ -79,20 +84,31 @@ void NPairHalfBinNewtonTri::build(NeighList *list)
     }
 
     // loop over all atoms in bins in stencil
-    // pairs for atoms j "below" i are excluded
-    // below = lower z or (equal z and lower y) or (equal zy and lower x)
-    //         (equal zyx and j <= i)
-    // latter excludes self-self interaction but allows superposed atoms
+    // for triclinic, bin stencil is full in all 3 dims
+    // must use itag/jtag to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     ibin = atom2bin[i];
     for (k = 0; k < nstencil; k++) {
       for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
-        if (x[j][2] < ztmp) continue;
-        if (x[j][2] == ztmp) {
-          if (x[j][1] < ytmp) continue;
-          if (x[j][1] == ytmp) {
-            if (x[j][0] < xtmp) continue;
-            if (x[j][0] == xtmp && j <= i) continue;
+
+        if (j <= i) continue;
+        if (j >= nlocal) {
+          jtag = tag[j];
+          if (itag > jtag) {
+            if ((itag+jtag) % 2 == 0) continue;
+          } else if (itag < jtag) {
+            if ((itag+jtag) % 2 == 1) continue;
+          } else {
+            if (fabs(x[j][2]-ztmp) > delta) {
+              if (x[j][2] < ztmp) continue;
+            } else if (fabs(x[j][1]-ytmp) > delta) {
+              if (x[j][1] < ytmp) continue;
+            } else {
+              if (x[j][0] < xtmp) continue;
+            }
           }
         }
 
diff --git a/src/npair_half_multi_newton_tri.cpp b/src/npair_half_multi_newton_tri.cpp
index 9bebfe71e2..24300f6929 100644
--- a/src/npair_half_multi_newton_tri.cpp
+++ b/src/npair_half_multi_newton_tri.cpp
@@ -18,10 +18,11 @@
 #include "atom_vec.h"
 #include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "molecule.h"
 #include "my_page.h"
-#include "neighbor.h"
 #include "neigh_list.h"
+#include "neighbor.h"
 
 using namespace LAMMPS_NS;
 
@@ -38,12 +39,14 @@ NPairHalfMultiNewtonTri::NPairHalfMultiNewtonTri(LAMMPS *lmp) : NPair(lmp) {}
 
 void NPairHalfMultiNewtonTri::build(NeighList *list)
 {
-  int i,j,k,n,itype,jtype,icollection,jcollection,ibin,jbin,which,ns,imol,iatom,moltemplate;
-  tagint tagprev;
+  int i,j,k,n,itype,jtype,ibin,jbin,icollection,jcollection,which,ns,imol,iatom,moltemplate;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   int *neighptr,*s;
   int js;
 
+  const double delta = 0.01 * force->angstrom;
+
   int *collection = neighbor->collection;
   double **x = atom->x;
   int *type = atom->type;
@@ -72,6 +75,8 @@ void NPairHalfMultiNewtonTri::build(NeighList *list)
   for (i = 0; i < nlocal; i++) {
     n = 0;
     neighptr = ipage->vget();
+
+    itag = tag[i];
     itype = type[i];
     icollection = collection[i];
     xtmp = x[i][0];
@@ -86,36 +91,51 @@ void NPairHalfMultiNewtonTri::build(NeighList *list)
     ibin = atom2bin[i];
 
     // loop through stencils for all collections
+
     for (jcollection = 0; jcollection < ncollections; jcollection++) {
 
       // if same collection use own bin
+
       if (icollection == jcollection) jbin = ibin;
-          else jbin = coord2bin(x[i], jcollection);
+      else jbin = coord2bin(x[i], jcollection);
 
       // loop over all atoms in bins in stencil
-      // stencil is empty if i larger than j
-      // stencil is half if i same size as j
-      // stencil is full if i smaller than j
-      // if half: pairs for atoms j "below" i are excluded
-      // below = lower z or (equal z and lower y) or (equal zy and lower x)
-      //         (equal zyx and j <= i)
-      // latter excludes self-self interaction but allows superposed atoms
+      // for triclinic:
+      //   stencil is empty if i larger than j
+      //   stencil is full if i smaller than j
+      //   stencil is full if i same size as j
+      // for i same size as j:
+      //   must use itag/jtag to eliminate half the I/J interactions
+      //   cannot use I/J exact coord comparison
+      //     b/c transforming orthog -> lambda -> orthog for ghost atoms
+      //     with an added PBC offset can shift all 3 coords by epsilon
 
-          s = stencil_multi[icollection][jcollection];
-          ns = nstencil_multi[icollection][jcollection];
+      s = stencil_multi[icollection][jcollection];
+      ns = nstencil_multi[icollection][jcollection];
 
-          for (k = 0; k < ns; k++) {
-            js = binhead_multi[jcollection][jbin + s[k]];
-            for (j = js; j >= 0; j = bins[j]) {
+      for (k = 0; k < ns; k++) {
+        js = binhead_multi[jcollection][jbin + s[k]];
+        for (j = js; j >= 0; j = bins[j]) {
 
-          // if same size (same collection), use half stencil
-          if(cutcollectionsq[icollection][icollection] == cutcollectionsq[jcollection][jcollection]){
-            if (x[j][2] < ztmp) continue;
-            if (x[j][2] == ztmp) {
-              if (x[j][1] < ytmp) continue;
-              if (x[j][1] == ytmp) {
-                if (x[j][0] < xtmp) continue;
-                if (x[j][0] == xtmp && j <= i) continue;
+          // if same size (same collection), exclude half of interactions
+
+          if (cutcollectionsq[icollection][icollection] ==
+              cutcollectionsq[jcollection][jcollection]) {
+            if (j <= i) continue;
+            if (j >= nlocal) {
+              jtag = tag[j];
+              if (itag > jtag) {
+                if ((itag+jtag) % 2 == 0) continue;
+              } else if (itag < jtag) {
+                if ((itag+jtag) % 2 == 1) continue;
+              } else {
+                if (fabs(x[j][2]-ztmp) > delta) {
+                  if (x[j][2] < ztmp) continue;
+                } else if (fabs(x[j][1]-ytmp) > delta) {
+                  if (x[j][1] < ytmp) continue;
+                } else {
+                  if (x[j][0] < xtmp) continue;
+                }
               }
             }
           }
@@ -123,28 +143,28 @@ void NPairHalfMultiNewtonTri::build(NeighList *list)
           jtype = type[j];
           if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
 
-              delx = xtmp - x[j][0];
-              dely = ytmp - x[j][1];
-              delz = ztmp - x[j][2];
-              rsq = delx*delx + dely*dely + delz*delz;
+          delx = xtmp - x[j][0];
+          dely = ytmp - x[j][1];
+          delz = ztmp - x[j][2];
+          rsq = delx*delx + dely*dely + delz*delz;
 
-              if (rsq <= cutneighsq[itype][jtype]) {
-                if (molecular != Atom::ATOMIC) {
-                    if (!moltemplate)
-                      which = find_special(special[i],nspecial[i],tag[j]);
-                    else if (imol >= 0)
-                      which = find_special(onemols[imol]->special[iatom],
-                                       onemols[imol]->nspecial[iatom],
-                                       tag[j]-tagprev);
-                    else which = 0;
-                    if (which == 0) neighptr[n++] = j;
-                    else if (domain->minimum_image_check(delx,dely,delz))
-                      neighptr[n++] = j;
-                    else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
-                } else neighptr[n++] = j;
-              }
-            }
+          if (rsq <= cutneighsq[itype][jtype]) {
+            if (molecular != Atom::ATOMIC) {
+              if (!moltemplate)
+                which = find_special(special[i],nspecial[i],tag[j]);
+              else if (imol >= 0)
+                which = find_special(onemols[imol]->special[iatom],
+                                     onemols[imol]->nspecial[iatom],
+                                     tag[j]-tagprev);
+              else which = 0;
+              if (which == 0) neighptr[n++] = j;
+              else if (domain->minimum_image_check(delx,dely,delz))
+                neighptr[n++] = j;
+              else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+            } else neighptr[n++] = j;
           }
+        }
+      }
     }
 
     ilist[inum++] = i;
diff --git a/src/npair_half_multi_old_newton_tri.cpp b/src/npair_half_multi_old_newton_tri.cpp
index fbb9a8e504..ce3149ebf5 100644
--- a/src/npair_half_multi_old_newton_tri.cpp
+++ b/src/npair_half_multi_old_newton_tri.cpp
@@ -18,6 +18,7 @@
 #include "atom_vec.h"
 #include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "molecule.h"
 #include "my_page.h"
 #include "neigh_list.h"
@@ -38,11 +39,13 @@ NPairHalfMultiOldNewtonTri::NPairHalfMultiOldNewtonTri(LAMMPS *lmp) : NPair(lmp)
 void NPairHalfMultiOldNewtonTri::build(NeighList *list)
 {
   int i,j,k,n,itype,jtype,ibin,which,ns,imol,iatom,moltemplate;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   int *neighptr,*s;
   double *cutsq,*distsq;
 
+  const double delta = 0.01 * force->angstrom;
+
   double **x = atom->x;
   int *type = atom->type;
   int *mask = atom->mask;
@@ -71,6 +74,7 @@ void NPairHalfMultiOldNewtonTri::build(NeighList *list)
     n = 0;
     neighptr = ipage->vget();
 
+    itag = tag[i];
     itype = type[i];
     xtmp = x[i][0];
     ytmp = x[i][1];
@@ -81,13 +85,12 @@ void NPairHalfMultiOldNewtonTri::build(NeighList *list)
       tagprev = tag[i] - iatom - 1;
     }
 
-    // loop over all atoms in bins, including self, in stencil
-    // skip if i,j neighbor cutoff is less than bin distance
-    // bins below self are excluded from stencil
-    // pairs for atoms j "below" i are excluded
-    // below = lower z or (equal z and lower y) or (equal zy and lower x)
-    //         (equal zyx and j <= i)
-    // latter excludes self-self interaction but allows superposed atoms
+    // loop over all atoms in bins in stencil
+    // for triclinic, bin stencil is full in all 3 dims
+    // must use itag/jtag to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     ibin = atom2bin[i];
     s = stencil_multi_old[itype];
@@ -98,12 +101,22 @@ void NPairHalfMultiOldNewtonTri::build(NeighList *list)
       for (j = binhead[ibin+s[k]]; j >= 0; j = bins[j]) {
         jtype = type[j];
         if (cutsq[jtype] < distsq[k]) continue;
-        if (x[j][2] < ztmp) continue;
-        if (x[j][2] == ztmp) {
-          if (x[j][1] < ytmp) continue;
-          if (x[j][1] == ytmp) {
-            if (x[j][0] < xtmp) continue;
-            if (x[j][0] == xtmp && j <= i) continue;
+
+        if (j <= i) continue;
+        if (j >= nlocal) {
+          jtag = tag[j];
+          if (itag > jtag) {
+            if ((itag+jtag) % 2 == 0) continue;
+          } else if (itag < jtag) {
+            if ((itag+jtag) % 2 == 1) continue;
+          } else {
+            if (fabs(x[j][2]-ztmp) > delta) {
+              if (x[j][2] < ztmp) continue;
+            } else if (fabs(x[j][1]-ytmp) > delta) {
+              if (x[j][1] < ytmp) continue;
+            } else {
+              if (x[j][0] < xtmp) continue;
+            }
           }
         }
 
diff --git a/src/npair_half_nsq_newton.cpp b/src/npair_half_nsq_newton.cpp
index e5f3138f0a..4d5afbdd3e 100644
--- a/src/npair_half_nsq_newton.cpp
+++ b/src/npair_half_nsq_newton.cpp
@@ -13,14 +13,16 @@
 ------------------------------------------------------------------------- */
 
 #include "npair_half_nsq_newton.h"
-#include "neigh_list.h"
+
 #include "atom.h"
 #include "atom_vec.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
 #include "group.h"
 #include "molecule.h"
-#include "domain.h"
 #include "my_page.h"
-#include "error.h"
+#include "neigh_list.h"
 
 using namespace LAMMPS_NS;
 
@@ -41,6 +43,9 @@ void NPairHalfNsqNewton::build(NeighList *list)
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   int *neighptr;
 
+  const double delta = 0.01 * force->angstrom;
+  const int triclinic = domain->triclinic;
+
   double **x = atom->x;
   int *type = atom->type;
   int *mask = atom->mask;
@@ -85,7 +90,12 @@ void NPairHalfNsqNewton::build(NeighList *list)
     }
 
     // loop over remaining atoms, owned and ghost
+    // use itag/jtag comparison to eliminate half the interactions
     // itag = jtag is possible for long cutoffs that include images of self
+    // for triclinic, must use delta to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison as for orthog
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     for (j = i+1; j < nall; j++) {
       if (includegroup && !(mask[j] & bitmask)) continue;
@@ -96,6 +106,14 @@ void NPairHalfNsqNewton::build(NeighList *list)
           if ((itag+jtag) % 2 == 0) continue;
         } else if (itag < jtag) {
           if ((itag+jtag) % 2 == 1) continue;
+        } else if (triclinic) {
+          if (fabs(x[j][2]-ztmp) > delta) {
+            if (x[j][2] < ztmp) continue;
+          } else if (fabs(x[j][1]-ytmp) > delta) {
+            if (x[j][1] < ytmp) continue;
+          } else {
+            if (x[j][0] < xtmp) continue;
+          }
         } else {
           if (x[j][2] < ztmp) continue;
           if (x[j][2] == ztmp) {
diff --git a/src/npair_half_respa_bin_newton_tri.cpp b/src/npair_half_respa_bin_newton_tri.cpp
index b2749bd7a7..4cd4ead0fa 100644
--- a/src/npair_half_respa_bin_newton_tri.cpp
+++ b/src/npair_half_respa_bin_newton_tri.cpp
@@ -13,13 +13,15 @@
 ------------------------------------------------------------------------- */
 
 #include "npair_half_respa_bin_newton_tri.h"
-#include "neigh_list.h"
+
 #include "atom.h"
 #include "atom_vec.h"
-#include "molecule.h"
 #include "domain.h"
-#include "my_page.h"
 #include "error.h"
+#include "force.h"
+#include "molecule.h"
+#include "my_page.h"
+#include "neigh_list.h"
 
 using namespace LAMMPS_NS;
 
@@ -38,10 +40,12 @@ NPairHalfRespaBinNewtonTri::NPairHalfRespaBinNewtonTri(LAMMPS *lmp) :
 void NPairHalfRespaBinNewtonTri::build(NeighList *list)
 {
   int i,j,k,n,itype,jtype,ibin,n_inner,n_middle,imol,iatom,moltemplate;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   int *neighptr,*neighptr_inner,*neighptr_middle;
 
+  const double delta = 0.01 * force->angstrom;
+
   double **x = atom->x;
   int *type = atom->type;
   int *mask = atom->mask;
@@ -94,6 +98,7 @@ void NPairHalfRespaBinNewtonTri::build(NeighList *list)
       neighptr_middle = ipage_middle->vget();
     }
 
+    itag = tag[i];
     itype = type[i];
     xtmp = x[i][0];
     ytmp = x[i][1];
@@ -105,20 +110,31 @@ void NPairHalfRespaBinNewtonTri::build(NeighList *list)
     }
 
     // loop over all atoms in bins in stencil
-    // pairs for atoms j "below" i are excluded
-    // below = lower z or (equal z and lower y) or (equal zy and lower x)
-    //         (equal zyx and j <= i)
-    // latter excludes self-self interaction but allows superposed atoms
+    // for triclinic, bin stencil is full in all 3 dims
+    // must use itag/jtag to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     ibin = atom2bin[i];
     for (k = 0; k < nstencil; k++) {
       for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
-        if (x[j][2] < ztmp) continue;
-        if (x[j][2] == ztmp) {
-          if (x[j][1] < ytmp) continue;
-          if (x[j][1] == ytmp) {
-            if (x[j][0] < xtmp) continue;
-            if (x[j][0] == xtmp && j <= i) continue;
+
+        if (j <= i) continue;
+        if (j >= nlocal) {
+          jtag = tag[j];
+          if (itag > jtag) {
+            if ((itag+jtag) % 2 == 0) continue;
+          } else if (itag < jtag) {
+            if ((itag+jtag) % 2 == 1) continue;
+          } else {
+            if (fabs(x[j][2]-ztmp) > delta) {
+              if (x[j][2] < ztmp) continue;
+            } else if (fabs(x[j][1]-ytmp) > delta) {
+              if (x[j][1] < ytmp) continue;
+            } else {
+              if (x[j][0] < xtmp) continue;
+            }
           }
         }
 
diff --git a/src/npair_half_respa_nsq_newton.cpp b/src/npair_half_respa_nsq_newton.cpp
index d231cddb87..ae56d62fb5 100644
--- a/src/npair_half_respa_nsq_newton.cpp
+++ b/src/npair_half_respa_nsq_newton.cpp
@@ -13,14 +13,16 @@
 ------------------------------------------------------------------------- */
 
 #include "npair_half_respa_nsq_newton.h"
-#include "neigh_list.h"
+
 #include "atom.h"
 #include "atom_vec.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
 #include "group.h"
 #include "molecule.h"
-#include "domain.h"
 #include "my_page.h"
-#include "error.h"
+#include "neigh_list.h"
 
 using namespace LAMMPS_NS;
 
@@ -44,6 +46,9 @@ void NPairHalfRespaNsqNewton::build(NeighList *list)
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   int *neighptr,*neighptr_inner,*neighptr_middle;
 
+  const double delta = 0.01 * force->angstrom;
+  const int triclinic = domain->triclinic;
+
   double **x = atom->x;
   int *type = atom->type;
   int *mask = atom->mask;
@@ -112,6 +117,12 @@ void NPairHalfRespaNsqNewton::build(NeighList *list)
     }
 
     // loop over remaining atoms, owned and ghost
+    // use itag/jtag comparison to eliminate half the interactions
+    // itag = jtag is possible for long cutoffs that include images of self
+    // for triclinic, must use delta to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison as for orthog
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     for (j = i+1; j < nall; j++) {
       if (includegroup && !(mask[j] & bitmask)) continue;
@@ -122,6 +133,14 @@ void NPairHalfRespaNsqNewton::build(NeighList *list)
           if ((itag+jtag) % 2 == 0) continue;
         } else if (itag < jtag) {
           if ((itag+jtag) % 2 == 1) continue;
+        } else if (triclinic) {
+          if (fabs(x[j][2]-ztmp) > delta) {
+            if (x[j][2] < ztmp) continue;
+          } else if (fabs(x[j][1]-ytmp) > delta) {
+            if (x[j][1] < ytmp) continue;
+          } else {
+            if (x[j][0] < xtmp) continue;
+          }
         } else {
           if (x[j][2] < ztmp) continue;
           if (x[j][2] == ztmp) {
diff --git a/src/npair_half_respa_nsq_newton.h b/src/npair_half_respa_nsq_newton.h
index e5233f5e9d..4a5ae23aef 100644
--- a/src/npair_half_respa_nsq_newton.h
+++ b/src/npair_half_respa_nsq_newton.h
@@ -15,7 +15,7 @@
 // clang-format off
 NPairStyle(half/respa/nsq/newton,
            NPairHalfRespaNsqNewton,
-           NP_HALF | NP_RESPA | NP_NSQ | NP_NEWTON | NP_ORTHO);
+           NP_HALF | NP_RESPA | NP_NSQ | NP_NEWTON | NP_ORTHO | NP_TRI);
 // clang-format on
 #else
 
diff --git a/src/npair_half_size_bin_newton_tri.cpp b/src/npair_half_size_bin_newton_tri.cpp
index 47bb9d01e1..0d1a0a7329 100644
--- a/src/npair_half_size_bin_newton_tri.cpp
+++ b/src/npair_half_size_bin_newton_tri.cpp
@@ -18,6 +18,7 @@
 #include "atom_vec.h"
 #include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "molecule.h"
 #include "my_page.h"
 #include "neigh_list.h"
@@ -39,11 +40,13 @@ NPairHalfSizeBinNewtonTri::NPairHalfSizeBinNewtonTri(LAMMPS *lmp) :
 void NPairHalfSizeBinNewtonTri::build(NeighList *list)
 {
   int i,j,jh,k,n,ibin,which,imol,iatom,moltemplate;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   double radi,radsum,cutsq;
   int *neighptr;
 
+  const double delta = 0.01 * force->angstrom;
+
   double **x = atom->x;
   double *radius = atom->radius;
   int *type = atom->type;
@@ -76,6 +79,7 @@ void NPairHalfSizeBinNewtonTri::build(NeighList *list)
     n = 0;
     neighptr = ipage->vget();
 
+    itag = tag[i];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
@@ -87,20 +91,31 @@ void NPairHalfSizeBinNewtonTri::build(NeighList *list)
     }
 
     // loop over all atoms in bins in stencil
-    // pairs for atoms j "below" i are excluded
-    // below = lower z or (equal z and lower y) or (equal zy and lower x)
-    //         (equal zyx and j <= i)
-    // latter excludes self-self interaction but allows superposed atoms
+    // for triclinic, bin stencil is full in all 3 dims
+    // must use itag/jtag to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     ibin = atom2bin[i];
     for (k = 0; k < nstencil; k++) {
       for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
-        if (x[j][2] < ztmp) continue;
-        if (x[j][2] == ztmp) {
-          if (x[j][1] < ytmp) continue;
-          if (x[j][1] == ytmp) {
-            if (x[j][0] < xtmp) continue;
-            if (x[j][0] == xtmp && j <= i) continue;
+
+        if (j <= i) continue;
+        if (j >= nlocal) {
+          jtag = tag[j];
+          if (itag > jtag) {
+            if ((itag+jtag) % 2 == 0) continue;
+          } else if (itag < jtag) {
+            if ((itag+jtag) % 2 == 1) continue;
+          } else {
+            if (fabs(x[j][2]-ztmp) > delta) {
+              if (x[j][2] < ztmp) continue;
+            } else if (fabs(x[j][1]-ytmp) > delta) {
+              if (x[j][1] < ytmp) continue;
+            } else {
+              if (x[j][0] < xtmp) continue;
+            }
           }
         }
 
diff --git a/src/npair_half_size_multi_newton_tri.cpp b/src/npair_half_size_multi_newton_tri.cpp
index 5d8a0f05ef..aa0d8e3f42 100644
--- a/src/npair_half_size_multi_newton_tri.cpp
+++ b/src/npair_half_size_multi_newton_tri.cpp
@@ -18,6 +18,7 @@
 #include "atom_vec.h"
 #include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "molecule.h"
 #include "my_page.h"
 #include "neighbor.h"
@@ -41,11 +42,13 @@ void NPairHalfSizeMultiNewtonTri::build(NeighList *list)
 {
   int i,j,jh,k,n,itype,jtype,icollection,jcollection,ibin,jbin,ns,js;
   int which,imol,iatom,moltemplate;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   double radi,radsum,cutdistsq;
   int *neighptr,*s;
 
+  const double delta = 0.01 * force->angstrom;
+
   int *collection = neighbor->collection;
   double **x = atom->x;
   double *radius = atom->radius;
@@ -78,6 +81,8 @@ void NPairHalfSizeMultiNewtonTri::build(NeighList *list)
   for (i = 0; i < nlocal; i++) {
     n = 0;
     neighptr = ipage->vget();
+
+    itag = tag[i];
     itype = type[i];
     icollection = collection[i];
     xtmp = x[i][0];
@@ -93,11 +98,13 @@ void NPairHalfSizeMultiNewtonTri::build(NeighList *list)
     ibin = atom2bin[i];
 
     // loop through stencils for all collections
+
     for (jcollection = 0; jcollection < ncollections; jcollection++) {
 
       // if same collection use own bin
+
       if (icollection == jcollection) jbin = ibin;
-          else jbin = coord2bin(x[i], jcollection);
+      else jbin = coord2bin(x[i], jcollection);
 
       // loop over all atoms in bins in stencil
       // stencil is empty if i larger than j
@@ -108,21 +115,32 @@ void NPairHalfSizeMultiNewtonTri::build(NeighList *list)
       //         (equal zyx and j <= i)
       // latter excludes self-self interaction but allows superposed atoms
 
-          s = stencil_multi[icollection][jcollection];
-          ns = nstencil_multi[icollection][jcollection];
+      s = stencil_multi[icollection][jcollection];
+      ns = nstencil_multi[icollection][jcollection];
 
-          for (k = 0; k < ns; k++) {
-            js = binhead_multi[jcollection][jbin + s[k]];
-            for (j = js; j >= 0; j = bins[j]) {
+      for (k = 0; k < ns; k++) {
+        js = binhead_multi[jcollection][jbin + s[k]];
+        for (j = js; j >= 0; j = bins[j]) {
 
-          // if same size (same collection), use half stencil
-          if (cutcollectionsq[icollection][icollection] == cutcollectionsq[jcollection][jcollection]){
-            if (x[j][2] < ztmp) continue;
-            if (x[j][2] == ztmp) {
-              if (x[j][1] < ytmp) continue;
-              if (x[j][1] == ytmp) {
-                if (x[j][0] < xtmp) continue;
-                if (x[j][0] == xtmp && j <= i) continue;
+          // if same size (same collection), exclude half of interactions
+
+          if (cutcollectionsq[icollection][icollection] ==
+              cutcollectionsq[jcollection][jcollection]) {
+            if (j <= i) continue;
+            if (j >= nlocal) {
+              jtag = tag[j];
+              if (itag > jtag) {
+                if ((itag+jtag) % 2 == 0) continue;
+              } else if (itag < jtag) {
+                if ((itag+jtag) % 2 == 1) continue;
+              } else {
+                if (fabs(x[j][2]-ztmp) > delta) {
+                  if (x[j][2] < ztmp) continue;
+                } else if (fabs(x[j][1]-ytmp) > delta) {
+                  if (x[j][1] < ytmp) continue;
+                } else {
+                  if (x[j][0] < xtmp) continue;
+                }
               }
             }
           }
@@ -130,34 +148,34 @@ void NPairHalfSizeMultiNewtonTri::build(NeighList *list)
           jtype = type[j];
           if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
 
-              delx = xtmp - x[j][0];
-              dely = ytmp - x[j][1];
-              delz = ztmp - x[j][2];
-              rsq = delx*delx + dely*dely + delz*delz;
-              radsum = radi + radius[j];
-              cutdistsq = (radsum+skin) * (radsum+skin);
+          delx = xtmp - x[j][0];
+          dely = ytmp - x[j][1];
+          delz = ztmp - x[j][2];
+          rsq = delx*delx + dely*dely + delz*delz;
+          radsum = radi + radius[j];
+          cutdistsq = (radsum+skin) * (radsum+skin);
 
-              if (rsq <= cutdistsq) {
-                jh = j;
-                if (history && rsq < radsum*radsum)
-                  jh = jh ^ mask_history;
+          if (rsq <= cutdistsq) {
+            jh = j;
+            if (history && rsq < radsum*radsum)
+              jh = jh ^ mask_history;
 
-                if (molecular != Atom::ATOMIC) {
-                  if (!moltemplate)
-                    which = find_special(special[i],nspecial[i],tag[j]);
-                  else if (imol >= 0)
-                    which = find_special(onemols[imol]->special[iatom],
-                                         onemols[imol]->nspecial[iatom],
-                                         tag[j]-tagprev);
-                  else which = 0;
-                  if (which == 0) neighptr[n++] = jh;
-                  else if (domain->minimum_image_check(delx,dely,delz))
-                    neighptr[n++] = jh;
-                  else if (which > 0) neighptr[n++] = jh ^ (which << SBBITS);
-                } else neighptr[n++] = jh;
-              }
-            }
+            if (molecular != Atom::ATOMIC) {
+              if (!moltemplate)
+                which = find_special(special[i],nspecial[i],tag[j]);
+              else if (imol >= 0)
+                which = find_special(onemols[imol]->special[iatom],
+                                     onemols[imol]->nspecial[iatom],
+                                     tag[j]-tagprev);
+              else which = 0;
+              if (which == 0) neighptr[n++] = jh;
+              else if (domain->minimum_image_check(delx,dely,delz))
+                neighptr[n++] = jh;
+              else if (which > 0) neighptr[n++] = jh ^ (which << SBBITS);
+            } else neighptr[n++] = jh;
           }
+        }
+      }
     }
 
     ilist[inum++] = i;
diff --git a/src/npair_half_size_multi_old_newton_tri.cpp b/src/npair_half_size_multi_old_newton_tri.cpp
index ea3f271956..848a19aa39 100644
--- a/src/npair_half_size_multi_old_newton_tri.cpp
+++ b/src/npair_half_size_multi_old_newton_tri.cpp
@@ -18,6 +18,7 @@
 #include "atom_vec.h"
 #include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "molecule.h"
 #include "my_page.h"
 #include "neigh_list.h"
@@ -38,12 +39,14 @@ NPairHalfSizeMultiOldNewtonTri::NPairHalfSizeMultiOldNewtonTri(LAMMPS *lmp) : NP
 void NPairHalfSizeMultiOldNewtonTri::build(NeighList *list)
 {
   int i,j,jh,k,n,itype,jtype,ibin,ns,which,imol,iatom,moltemplate;
-  tagint tagprev;
+  tagint itag,jtag,tagprev;
   double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
   double radi,radsum,cutdistsq;
   int *neighptr,*s;
   double *cutsq,*distsq;
 
+  const double delta = 0.01 * force->angstrom;
+
   double **x = atom->x;
   double *radius = atom->radius;
   int *type = atom->type;
@@ -76,6 +79,7 @@ void NPairHalfSizeMultiOldNewtonTri::build(NeighList *list)
     n = 0;
     neighptr = ipage->vget();
 
+    itag = tag[i];
     itype = type[i];
     xtmp = x[i][0];
     ytmp = x[i][1];
@@ -87,13 +91,12 @@ void NPairHalfSizeMultiOldNewtonTri::build(NeighList *list)
       tagprev = tag[i] - iatom - 1;
     }
 
-    // loop over all atoms in bins, including self, in stencil
-    // skip if i,j neighbor cutoff is less than bin distance
-    // bins below self are excluded from stencil
-    // pairs for atoms j "below" i are excluded
-    // below = lower z or (equal z and lower y) or (equal zy and lower x)
-    //         (equal zyx and j <= i)
-    // latter excludes self-self interaction but allows superposed atoms
+    // loop over all atoms in bins in stencil
+    // for triclinic, bin stencil is full in all 3 dims
+    // must use itag/jtag to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     ibin = atom2bin[i];
     s = stencil_multi_old[itype];
@@ -104,12 +107,22 @@ void NPairHalfSizeMultiOldNewtonTri::build(NeighList *list)
       for (j = binhead[ibin+s[k]]; j >= 0; j = bins[j]) {
         jtype = type[j];
         if (cutsq[jtype] < distsq[k]) continue;
-        if (x[j][2] < ztmp) continue;
-        if (x[j][2] == ztmp) {
-          if (x[j][1] < ytmp) continue;
-          if (x[j][1] == ytmp) {
-            if (x[j][0] < xtmp) continue;
-            if (x[j][0] == xtmp && j <= i) continue;
+
+        if (j <= i) continue;
+        if (j >= nlocal) {
+          jtag = tag[j];
+          if (itag > jtag) {
+            if ((itag+jtag) % 2 == 0) continue;
+          } else if (itag < jtag) {
+            if ((itag+jtag) % 2 == 1) continue;
+          } else {
+            if (fabs(x[j][2]-ztmp) > delta) {
+              if (x[j][2] < ztmp) continue;
+            } else if (fabs(x[j][1]-ytmp) > delta) {
+              if (x[j][1] < ytmp) continue;
+            } else {
+              if (x[j][0] < xtmp) continue;
+            }
           }
         }
 
diff --git a/src/npair_half_size_nsq_newton.cpp b/src/npair_half_size_nsq_newton.cpp
index 93d65c7a45..ce0c7f9562 100644
--- a/src/npair_half_size_nsq_newton.cpp
+++ b/src/npair_half_size_nsq_newton.cpp
@@ -18,6 +18,7 @@
 #include "atom_vec.h"
 #include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "molecule.h"
 #include "group.h"
 #include "my_page.h"
@@ -45,6 +46,9 @@ void NPairHalfSizeNsqNewton::build(NeighList *list)
   double radi,radsum,cutsq;
   int *neighptr;
 
+  const double delta = 0.01 * force->angstrom;
+  const int triclinic = domain->triclinic;
+
   double **x = atom->x;
   double *radius = atom->radius;
   tagint *tag = atom->tag;
@@ -93,6 +97,12 @@ void NPairHalfSizeNsqNewton::build(NeighList *list)
     }
 
     // loop over remaining atoms, owned and ghost
+    // use itag/jtag comparison to eliminate half the interactions
+    // itag = jtag is possible for long cutoffs that include images of self
+    // for triclinic, must use delta to eliminate half the I/J interactions
+    // cannot use I/J exact coord comparison as for orthog
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     for (j = i+1; j < nall; j++) {
       if (includegroup && !(mask[j] & bitmask)) continue;
@@ -103,6 +113,14 @@ void NPairHalfSizeNsqNewton::build(NeighList *list)
           if ((itag+jtag) % 2 == 0) continue;
         } else if (itag < jtag) {
           if ((itag+jtag) % 2 == 1) continue;
+        } else if (triclinic) {
+          if (fabs(x[j][2]-ztmp) > delta) {
+            if (x[j][2] < ztmp) continue;
+          } else if (fabs(x[j][1]-ytmp) > delta) {
+            if (x[j][1] < ytmp) continue;
+          } else {
+            if (x[j][0] < xtmp) continue;
+          }
         } else {
           if (x[j][2] < ztmp) continue;
           if (x[j][2] == ztmp) {
diff --git a/src/npair_halffull_newton.cpp b/src/npair_halffull_newton.cpp
index 407a71e614..12320c46f3 100644
--- a/src/npair_halffull_newton.cpp
+++ b/src/npair_halffull_newton.cpp
@@ -14,7 +14,9 @@
 #include "npair_halffull_newton.h"
 
 #include "atom.h"
+#include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "my_page.h"
 #include "neigh_list.h"
 
@@ -37,6 +39,9 @@ void NPairHalffullNewton::build(NeighList *list)
   int *neighptr, *jlist;
   double xtmp, ytmp, ztmp;
 
+  const double delta = 0.01 * force->angstrom;
+  const int triclinic = domain->triclinic;
+
   double **x = atom->x;
   int nlocal = atom->nlocal;
 
@@ -65,6 +70,11 @@ void NPairHalffullNewton::build(NeighList *list)
     ztmp = x[i][2];
 
     // loop over full neighbor list
+    // use i < j < nlocal to eliminate half the local/local interactions
+    // for triclinic, must use delta to eliminate half the local/ghost interactions
+    // cannot use I/J exact coord comparison as for orthog
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     jlist = firstneigh_full[i];
     jnum = numneigh_full[i];
@@ -72,8 +82,17 @@ void NPairHalffullNewton::build(NeighList *list)
     for (jj = 0; jj < jnum; jj++) {
       joriginal = jlist[jj];
       j = joriginal & NEIGHMASK;
+
       if (j < nlocal) {
         if (i > j) continue;
+      } else if (triclinic) {
+        if (fabs(x[j][2]-ztmp) > delta) {
+          if (x[j][2] < ztmp) continue;
+        } else if (fabs(x[j][1]-ytmp) > delta) {
+          if (x[j][1] < ytmp) continue;
+        } else {
+          if (x[j][0] < xtmp) continue;
+        }
       } else {
         if (x[j][2] < ztmp) continue;
         if (x[j][2] == ztmp) {
@@ -81,6 +100,7 @@ void NPairHalffullNewton::build(NeighList *list)
           if (x[j][1] == ytmp && x[j][0] < xtmp) continue;
         }
       }
+
       neighptr[n++] = joriginal;
     }
 
diff --git a/src/npair_halffull_newton_trim.cpp b/src/npair_halffull_newton_trim.cpp
index b7bb72c990..e758c04284 100644
--- a/src/npair_halffull_newton_trim.cpp
+++ b/src/npair_halffull_newton_trim.cpp
@@ -14,7 +14,9 @@
 #include "npair_halffull_newton_trim.h"
 
 #include "atom.h"
+#include "domain.h"
 #include "error.h"
+#include "force.h"
 #include "my_page.h"
 #include "neigh_list.h"
 
@@ -38,6 +40,9 @@ void NPairHalffullNewtonTrim::build(NeighList *list)
   double xtmp, ytmp, ztmp;
   double delx, dely, delz, rsq;
 
+  const double delta = 0.01 * force->angstrom;
+  const int triclinic = domain->triclinic;
+
   double **x = atom->x;
   int nlocal = atom->nlocal;
 
@@ -68,6 +73,11 @@ void NPairHalffullNewtonTrim::build(NeighList *list)
     ztmp = x[i][2];
 
     // loop over full neighbor list
+    // use i < j < nlocal to eliminate half the local/local interactions
+    // for triclinic, must use delta to eliminate half the local/ghost interactions
+    // cannot use I/J exact coord comparison as for orthog
+    //   b/c transforming orthog -> lambda -> orthog for ghost atoms
+    //   with an added PBC offset can shift all 3 coords by epsilon
 
     jlist = firstneigh_full[i];
     jnum = numneigh_full[i];
@@ -75,8 +85,17 @@ void NPairHalffullNewtonTrim::build(NeighList *list)
     for (jj = 0; jj < jnum; jj++) {
       joriginal = jlist[jj];
       j = joriginal & NEIGHMASK;
+
       if (j < nlocal) {
         if (i > j) continue;
+      } else if (triclinic) {
+        if (fabs(x[j][2]-ztmp) > delta) {
+          if (x[j][2] < ztmp) continue;
+        } else if (fabs(x[j][1]-ytmp) > delta) {
+          if (x[j][1] < ytmp) continue;
+        } else {
+          if (x[j][0] < xtmp) continue;
+        }
       } else {
         if (x[j][2] < ztmp) continue;
         if (x[j][2] == ztmp) {
diff --git a/src/npair_trim.cpp b/src/npair_trim.cpp
index 14974d72ab..a4b6c1c6a1 100644
--- a/src/npair_trim.cpp
+++ b/src/npair_trim.cpp
@@ -50,11 +50,15 @@ void NPairTrim::build(NeighList *list)
   int *numneigh_copy = listcopy->numneigh;
   int **firstneigh_copy = listcopy->firstneigh;
   int inum = listcopy->inum;
+  int gnum = listcopy->gnum;
 
   list->inum = inum;
-  list->gnum = listcopy->gnum;
+  list->gnum = gnum;
 
-  for (ii = 0; ii < inum; ii++) {
+  int inum_trim = inum;
+  if (list->ghost) inum_trim += gnum;
+
+  for (ii = 0; ii < inum_trim; ii++) {
     n = 0;
     neighptr = ipage->vget();
 
diff --git a/src/nstencil_half_bin_2d_tri.cpp b/src/nstencil_half_bin_2d_tri.cpp
index 06831730fd..920918fe09 100644
--- a/src/nstencil_half_bin_2d_tri.cpp
+++ b/src/nstencil_half_bin_2d_tri.cpp
@@ -27,9 +27,17 @@ void NStencilHalfBin2dTri::create()
 {
   int i, j;
 
+  // for triclinic, need to use full stencil in all dims
+  //   not a half stencil in y
+  // b/c transforming orthog -> lambda -> orthog for ghost atoms
+  //   with an added PBC offset can shift both coords by epsilon
+  // thus for an I/J owned/ghost pair, the xy coords
+  //   and bin assignments can be different on I proc vs J proc
+
   nstencil = 0;
 
-  for (j = 0; j <= sy; j++)
+  for (j = -sy; j <= sy; j++)
     for (i = -sx; i <= sx; i++)
-      if (bin_distance(i, j, 0) < cutneighmaxsq) stencil[nstencil++] = j * mbinx + i;
+      if (bin_distance(i, j, 0) < cutneighmaxsq)
+	stencil[nstencil++] = j * mbinx + i;
 }
diff --git a/src/nstencil_half_bin_3d_tri.cpp b/src/nstencil_half_bin_3d_tri.cpp
index d066a24ee6..72bef7fb76 100644
--- a/src/nstencil_half_bin_3d_tri.cpp
+++ b/src/nstencil_half_bin_3d_tri.cpp
@@ -27,9 +27,16 @@ void NStencilHalfBin3dTri::create()
 {
   int i, j, k;
 
+  // for triclinic, need to use full stencil in all dims
+  //   not a half stencil in z
+  // b/c transforming orthog -> lambda -> orthog for ghost atoms
+  //   with an added PBC offset can shift all 3 coords by epsilon
+  // thus for an I/J owned/ghost pair, the xyz coords
+  //   and bin assignments can be different on I proc vs J proc
+
   nstencil = 0;
 
-  for (k = 0; k <= sz; k++)
+  for (k = -sz; k <= sz; k++)
     for (j = -sy; j <= sy; j++)
       for (i = -sx; i <= sx; i++)
         if (bin_distance(i, j, k) < cutneighmaxsq)
diff --git a/src/nstencil_half_multi_2d_tri.cpp b/src/nstencil_half_multi_2d_tri.cpp
index bf39c04099..85bbe94c86 100644
--- a/src/nstencil_half_multi_2d_tri.cpp
+++ b/src/nstencil_half_multi_2d_tri.cpp
@@ -80,7 +80,7 @@ void NStencilHalfMulti2dTri::create()
       cutsq = cutcollectionsq[icollection][jcollection];
 
       if (flag_half_multi[icollection][jcollection]) {
-        for (j = 0; j <= sy; j++)
+        for (j = -sy; j <= sy; j++)
           for (i = -sx; i <= sx; i++)
             if (bin_distance_multi(i, j, 0, bin_collection) < cutsq)
               stencil_multi[icollection][jcollection][ns++] = j * mbinx + i;
diff --git a/src/nstencil_half_multi_3d_tri.cpp b/src/nstencil_half_multi_3d_tri.cpp
index f2d4d051ad..9761e15854 100644
--- a/src/nstencil_half_multi_3d_tri.cpp
+++ b/src/nstencil_half_multi_3d_tri.cpp
@@ -81,7 +81,7 @@ void NStencilHalfMulti3dTri::create()
       cutsq = cutcollectionsq[icollection][jcollection];
 
       if (flag_half_multi[icollection][jcollection]) {
-        for (k = 0; k <= sz; k++)
+        for (k = -sz; k <= sz; k++)
           for (j = -sy; j <= sy; j++)
             for (i = -sx; i <= sx; i++)
               if (bin_distance_multi(i, j, k, bin_collection) < cutsq)
diff --git a/src/nstencil_half_multi_old_2d_tri.cpp b/src/nstencil_half_multi_old_2d_tri.cpp
index 1438aef843..0aeb65bebd 100644
--- a/src/nstencil_half_multi_old_2d_tri.cpp
+++ b/src/nstencil_half_multi_old_2d_tri.cpp
@@ -37,7 +37,7 @@ void NStencilHalfMultiOld2dTri::create()
     s = stencil_multi_old[itype];
     distsq = distsq_multi_old[itype];
     n = 0;
-    for (j = 0; j <= sy; j++)
+    for (j = -sy; j <= sy; j++)
       for (i = -sx; i <= sx; i++) {
         rsq = bin_distance(i, j, 0);
         if (rsq < typesq) {
diff --git a/src/nstencil_half_multi_old_3d_tri.cpp b/src/nstencil_half_multi_old_3d_tri.cpp
index 836eee6039..3717b7836b 100644
--- a/src/nstencil_half_multi_old_3d_tri.cpp
+++ b/src/nstencil_half_multi_old_3d_tri.cpp
@@ -37,7 +37,7 @@ void NStencilHalfMultiOld3dTri::create()
     s = stencil_multi_old[itype];
     distsq = distsq_multi_old[itype];
     n = 0;
-    for (k = 0; k <= sz; k++)
+    for (k = -sz; k <= sz; k++)
       for (j = -sy; j <= sy; j++)
         for (i = -sx; i <= sx; i++) {
           rsq = bin_distance(i, j, k);
diff --git a/src/variable.cpp b/src/variable.cpp
index cf2e5c3b6f..264dcf6258 100644
--- a/src/variable.cpp
+++ b/src/variable.cpp
@@ -1469,8 +1469,7 @@ double Variable::evaluate(char *str, Tree **tree, int ivar)
         if (domain->box_exist == 0)
           print_var_error(FLERR,"Variable evaluation before simulation box is defined",ivar);
 
-        // uppercase used to force access of
-        // global vector vs global scalar, and global array vs global vector
+        // uppercase used to access peratom data by equal-style var
 
         int lowercase = 1;
         if (word[0] == 'C') lowercase = 0;
@@ -1479,7 +1478,6 @@ double Variable::evaluate(char *str, Tree **tree, int ivar)
         if (!compute)
           print_var_error(FLERR,fmt::format("Invalid compute ID '{}' in variable formula", word+2),ivar);
 
-
         // parse zero or one or two trailing brackets
         // point i beyond last bracket
         // nbracket = # of bracket pairs
@@ -1501,218 +1499,234 @@ double Variable::evaluate(char *str, Tree **tree, int ivar)
           }
         }
 
-        // c_ID = scalar from global scalar, must be lowercase
+        // equal-style variable is being evaluated
 
-        if (nbracket == 0 && compute->scalar_flag && lowercase) {
+        if (style[ivar] == EQUAL) {
 
-          if (!compute->is_initialized())
-            print_var_error(FLERR,"Variable formula compute cannot be invoked before "
-                            "initialization by a run",ivar);
-          if (!(compute->invoked_flag & Compute::INVOKED_SCALAR)) {
-            compute->compute_scalar();
-            compute->invoked_flag |= Compute::INVOKED_SCALAR;
-          }
+          // c_ID = scalar from global scalar
 
-          value1 = compute->scalar;
-          if (tree) {
-            auto newtree = new Tree();
-            newtree->type = VALUE;
-            newtree->value = value1;
-            treestack[ntreestack++] = newtree;
-          } else argstack[nargstack++] = value1;
+          if (lowercase && nbracket == 0) {
 
-        // c_ID[i] = scalar from global vector, must be lowercase
+            if (!compute->scalar_flag)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (!compute->is_initialized())
+              print_var_error(FLERR,"Variable formula compute cannot be invoked before "
+                              "initialization by a run",ivar);
+            if (!(compute->invoked_flag & Compute::INVOKED_SCALAR)) {
+              compute->compute_scalar();
+              compute->invoked_flag |= Compute::INVOKED_SCALAR;
+            }
 
-        } else if (nbracket == 1 && compute->vector_flag && lowercase) {
+            value1 = compute->scalar;
+            argstack[nargstack++] = value1;
 
-          if (index1 > compute->size_vector &&
-              compute->size_vector_variable == 0)
-            print_var_error(FLERR,"Variable formula compute vector is accessed out-of-range",ivar,0);
-          if (!compute->is_initialized())
-            print_var_error(FLERR,"Variable formula compute cannot be invoked before "
-                            "initialization by a run",ivar);
-          if (!(compute->invoked_flag & Compute::INVOKED_VECTOR)) {
-            compute->compute_vector();
-            compute->invoked_flag |= Compute::INVOKED_VECTOR;
-          }
+          // c_ID[i] = scalar from global vector
+
+          } else if (lowercase && nbracket == 1) {
+
+            if (!compute->vector_flag)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (index1 > compute->size_vector &&
+                compute->size_vector_variable == 0)
+              print_var_error(FLERR,"Variable formula compute vector is accessed out-of-range",ivar,0);
+            if (!compute->is_initialized())
+              print_var_error(FLERR,"Variable formula compute cannot be invoked before "
+                              "initialization by a run",ivar);
+            if (!(compute->invoked_flag & Compute::INVOKED_VECTOR)) {
+              compute->compute_vector();
+              compute->invoked_flag |= Compute::INVOKED_VECTOR;
+            }
 
           if (compute->size_vector_variable &&
               index1 > compute->size_vector) value1 = 0.0;
           else value1 = compute->vector[index1-1];
-          if (tree) {
+          argstack[nargstack++] = value1;
+
+          // c_ID[i][j] = scalar from global array
+
+          } else if (lowercase && nbracket == 2) {
+
+            if (!compute->array_flag)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (index1 > compute->size_array_rows &&
+                compute->size_array_rows_variable == 0)
+              print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
+            if (index2 > compute->size_array_cols)
+              print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
+            if (!compute->is_initialized())
+              print_var_error(FLERR,"Variable formula compute cannot be invoked before "
+                              "initialization by a run",ivar);
+            if (!(compute->invoked_flag & Compute::INVOKED_ARRAY)) {
+              compute->compute_array();
+              compute->invoked_flag |= Compute::INVOKED_ARRAY;
+            }
+
+            if (compute->size_array_rows_variable &&
+                index1 > compute->size_array_rows) value1 = 0.0;
+            else value1 = compute->array[index1-1][index2-1];
+            argstack[nargstack++] = value1;
+
+          // C_ID[i] = scalar element of per-atom vector, note uppercase "C"
+
+          } else if (!lowercase && nbracket == 1) {
+
+            if (!compute->peratom_flag)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (compute->size_peratom_cols)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (!compute->is_initialized())
+              print_var_error(FLERR,"Variable formula compute cannot be invoked before "
+                              "initialization by a run",ivar);
+            if (!(compute->invoked_flag & Compute::INVOKED_PERATOM)) {
+              compute->compute_peratom();
+              compute->invoked_flag |= Compute::INVOKED_PERATOM;
+            }
+
+            peratom2global(1,nullptr,compute->vector_atom,1,index1,tree,
+                           treestack,ntreestack,argstack,nargstack);
+
+          // C_ID[i][j] = scalar element of per-atom array, note uppercase "C"
+
+          } else if (!lowercase && nbracket == 2) {
+
+            if (!compute->peratom_flag)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (!compute->size_peratom_cols)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (index2 > compute->size_peratom_cols)
+              print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
+            if (!compute->is_initialized())
+              print_var_error(FLERR,"Variable formula compute cannot be invoked before "
+                              "initialization by a run",ivar);
+            if (!(compute->invoked_flag & Compute::INVOKED_PERATOM)) {
+              compute->compute_peratom();
+              compute->invoked_flag |= Compute::INVOKED_PERATOM;
+            }
+
+            if (compute->array_atom)
+              peratom2global(1,nullptr,&compute->array_atom[0][index2-1],
+                             compute->size_peratom_cols,index1,
+                             tree,treestack,ntreestack,argstack,nargstack);
+            else
+              peratom2global(1,nullptr,nullptr,compute->size_peratom_cols,index1,
+                             tree,treestack,ntreestack,argstack,nargstack);
+
+          // no other possibilities for equal-style variable, so error
+
+          } else print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+
+        // vector-style variable is being evaluated
+
+        } else if (style[ivar] == VECTOR) {
+
+          // c_ID = vector from global vector
+
+          if (lowercase && nbracket == 0) {
+
+            if (!compute->vector_flag)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (compute->size_vector == 0)
+              print_var_error(FLERR,"Variable formula compute vector is zero length",ivar);
+            if (!compute->is_initialized())
+              print_var_error(FLERR,"Variable formula compute cannot be invoked before "
+                              "initialization by a run",ivar);
+            if (!(compute->invoked_flag & Compute::INVOKED_VECTOR)) {
+              compute->compute_vector();
+              compute->invoked_flag |= Compute::INVOKED_VECTOR;
+            }
+
             auto newtree = new Tree();
-            newtree->type = VALUE;
-            newtree->value = value1;
+            newtree->type = VECTORARRAY;
+            newtree->array = compute->vector;
+            newtree->nvector = compute->size_vector;
+            newtree->nstride = 1;
             treestack[ntreestack++] = newtree;
-          } else argstack[nargstack++] = value1;
 
-        // c_ID[i][j] = scalar from global array, must be lowercase
+          // c_ID[i] = vector from global array
 
-        } else if (nbracket == 2 && compute->array_flag && lowercase) {
+          } else if (lowercase && nbracket == 1) {
 
-          if (index1 > compute->size_array_rows &&
-              compute->size_array_rows_variable == 0)
-            print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
-          if (index2 > compute->size_array_cols)
-            print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
-          if (!compute->is_initialized())
-            print_var_error(FLERR,"Variable formula compute cannot be invoked before "
-                            "initialization by a run",ivar);
-          if (!(compute->invoked_flag & Compute::INVOKED_ARRAY)) {
-            compute->compute_array();
-            compute->invoked_flag |= Compute::INVOKED_ARRAY;
-          }
+            if (!compute->array_flag)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (compute->size_array_rows == 0)
+              print_var_error(FLERR,"Variable formula compute array is zero length",ivar);
+            if (index1 > compute->size_array_cols)
+              print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
+            if (!compute->is_initialized())
+              print_var_error(FLERR,"Variable formula compute cannot be invoked before "
+                              "initialization by a run",ivar);
+            if (!(compute->invoked_flag & Compute::INVOKED_ARRAY)) {
+              compute->compute_array();
+              compute->invoked_flag |= Compute::INVOKED_ARRAY;
+            }
 
-          if (compute->size_array_rows_variable &&
-              index1 > compute->size_array_rows) value1 = 0.0;
-          else value1 = compute->array[index1-1][index2-1];
-          if (tree) {
             auto newtree = new Tree();
-            newtree->type = VALUE;
-            newtree->value = value1;
+            newtree->type = VECTORARRAY;
+            newtree->array = &compute->array[0][index1-1];
+            newtree->nvector = compute->size_array_rows;
+            newtree->nstride = compute->size_array_cols;
             treestack[ntreestack++] = newtree;
-          } else argstack[nargstack++] = value1;
 
-        // c_ID = vector from global vector, lowercase or uppercase
+          // no other possibilities for vector-style variable, so error
 
-        } else if (nbracket == 0 && compute->vector_flag) {
+          } else print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
 
-          if (tree == nullptr)
-            print_var_error(FLERR,"Compute global vector in equal-style variable formula",ivar);
-          if (treetype == ATOM)
-            print_var_error(FLERR,"Compute global vector in atom-style variable formula",ivar);
-          if (compute->size_vector == 0)
-            print_var_error(FLERR,"Variable formula compute vector is zero length",ivar);
-          if (!compute->is_initialized())
-            print_var_error(FLERR,"Variable formula compute cannot be invoked before "
-                            "initialization by a run",ivar);
-          if (!(compute->invoked_flag & Compute::INVOKED_VECTOR)) {
-            compute->compute_vector();
-            compute->invoked_flag |= Compute::INVOKED_VECTOR;
-          }
+        // atom-style variable is being evaluated
 
-          auto newtree = new Tree();
-          newtree->type = VECTORARRAY;
-          newtree->array = compute->vector;
-          newtree->nvector = compute->size_vector;
-          newtree->nstride = 1;
-          treestack[ntreestack++] = newtree;
+        } else if (style[ivar] == ATOM) {
 
-        // c_ID[i] = vector from global array, lowercase or uppercase
+          // c_ID = vector from per-atom vector
 
-        } else if (nbracket == 1 && compute->array_flag) {
+          if (lowercase && nbracket == 0) {
 
-          if (tree == nullptr)
-            print_var_error(FLERR,"Compute global vector in equal-style variable formula",ivar);
-          if (treetype == ATOM)
-            print_var_error(FLERR,"Compute global vector in atom-style variable formula",ivar);
-          if (compute->size_array_rows == 0)
-            print_var_error(FLERR,"Variable formula compute array is zero length",ivar);
-          if (!compute->is_initialized())
-            print_var_error(FLERR,"Variable formula compute cannot be invoked before "
-                            "initialization by a run",ivar);
-          if (!(compute->invoked_flag & Compute::INVOKED_ARRAY)) {
-            compute->compute_array();
-            compute->invoked_flag |= Compute::INVOKED_ARRAY;
-          }
+            if (!compute->peratom_flag)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (compute->size_peratom_cols)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (!compute->is_initialized())
+              print_var_error(FLERR,"Variable formula compute cannot be invoked before "
+                              "initialization by a run",ivar);
+            if (!(compute->invoked_flag & Compute::INVOKED_PERATOM)) {
+              compute->compute_peratom();
+              compute->invoked_flag |= Compute::INVOKED_PERATOM;
+            }
 
-          auto newtree = new Tree();
-          newtree->type = VECTORARRAY;
-          newtree->array = &compute->array[0][index1-1];
-          newtree->nvector = compute->size_array_rows;
-          newtree->nstride = compute->size_array_cols;
-          treestack[ntreestack++] = newtree;
+            auto newtree = new Tree();
+            newtree->type = ATOMARRAY;
+            newtree->array = compute->vector_atom;
+            newtree->nstride = 1;
+            treestack[ntreestack++] = newtree;
 
-        // c_ID[i] = scalar from per-atom vector
+          // c_ID[i] = vector from per-atom array
 
-        } else if (nbracket == 1 && compute->peratom_flag &&
-                   compute->size_peratom_cols == 0) {
+          } else if (lowercase && nbracket == 1) {
 
-          if (!compute->is_initialized())
-            print_var_error(FLERR,"Variable formula compute cannot be invoked before "
-                            "initialization by a run",ivar);
-          if (!(compute->invoked_flag & Compute::INVOKED_PERATOM)) {
-            compute->compute_peratom();
-            compute->invoked_flag |= Compute::INVOKED_PERATOM;
-          }
+            if (!compute->peratom_flag)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (!compute->size_peratom_cols)
+              print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+            if (index1 > compute->size_peratom_cols)
+              print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
+            if (!compute->is_initialized())
+              print_var_error(FLERR,"Variable formula compute cannot be invoked before "
+                              "initialization by a run",ivar);
+            if (!(compute->invoked_flag & Compute::INVOKED_PERATOM)) {
+              compute->compute_peratom();
+              compute->invoked_flag |= Compute::INVOKED_PERATOM;
+            }
 
-          peratom2global(1,nullptr,compute->vector_atom,1,index1,tree,
-                         treestack,ntreestack,argstack,nargstack);
+            auto newtree = new Tree();
+            newtree->type = ATOMARRAY;
+            newtree->array = nullptr;
+            if (compute->array_atom)
+              newtree->array = &compute->array_atom[0][index1-1];
+            newtree->nstride = compute->size_peratom_cols;
+            treestack[ntreestack++] = newtree;
 
-        // c_ID[i][j] = scalar from per-atom array
+          // no other possibilities for atom-style variable, so error
 
-        } else if (nbracket == 2 && compute->peratom_flag &&
-                   compute->size_peratom_cols > 0) {
-
-          if (index2 > compute->size_peratom_cols)
-            print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
-          if (!compute->is_initialized())
-            print_var_error(FLERR,"Variable formula compute cannot be invoked before "
-                            "initialization by a run",ivar);
-          if (!(compute->invoked_flag & Compute::INVOKED_PERATOM)) {
-            compute->compute_peratom();
-            compute->invoked_flag |= Compute::INVOKED_PERATOM;
-          }
-
-          if (compute->array_atom)
-            peratom2global(1,nullptr,&compute->array_atom[0][index2-1],compute->size_peratom_cols,index1,
-                           tree,treestack,ntreestack,argstack,nargstack);
-          else
-            peratom2global(1,nullptr,nullptr,compute->size_peratom_cols,index1,
-                           tree,treestack,ntreestack,argstack,nargstack);
-
-        // c_ID = vector from per-atom vector
-
-        } else if (nbracket == 0 && compute->peratom_flag &&
-                   compute->size_peratom_cols == 0) {
-
-          if (tree == nullptr)
-            print_var_error(FLERR,"Per-atom compute in equal-style variable formula",ivar);
-          if (treetype == VECTOR)
-            print_var_error(FLERR,"Per-atom compute in vector-style variable formula",ivar);
-          if (!compute->is_initialized())
-            print_var_error(FLERR,"Variable formula compute cannot be invoked before "
-                            "initialization by a run",ivar);
-          if (!(compute->invoked_flag & Compute::INVOKED_PERATOM)) {
-            compute->compute_peratom();
-            compute->invoked_flag |= Compute::INVOKED_PERATOM;
-          }
-
-          auto newtree = new Tree();
-          newtree->type = ATOMARRAY;
-          newtree->array = compute->vector_atom;
-          newtree->nstride = 1;
-          treestack[ntreestack++] = newtree;
-
-        // c_ID[i] = vector from per-atom array
-
-        } else if (nbracket == 1 && compute->peratom_flag &&
-                   compute->size_peratom_cols > 0) {
-
-          if (tree == nullptr)
-            print_var_error(FLERR,"Per-atom compute in equal-style variable formula",ivar);
-          if (treetype == VECTOR)
-            print_var_error(FLERR,"Per-atom compute in vector-style variable formula",ivar);
-          if (index1 > compute->size_peratom_cols)
-            print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
-          if (!compute->is_initialized())
-            print_var_error(FLERR,"Variable formula compute cannot be invoked before "
-                            "initialization by a run",ivar);
-          if (!(compute->invoked_flag & Compute::INVOKED_PERATOM)) {
-            compute->compute_peratom();
-            compute->invoked_flag |= Compute::INVOKED_PERATOM;
-          }
-
-          auto newtree = new Tree();
-          newtree->type = ATOMARRAY;
-          if (compute->array_atom)
-            newtree->array = &compute->array_atom[0][index1-1];
-          newtree->nstride = compute->size_peratom_cols;
-          treestack[ntreestack++] = newtree;
-
-        } else if (nbracket == 1 && compute->local_flag) {
-          print_var_error(FLERR,"Cannot access local data via indexing",ivar);
-        } else print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+          } else print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
+        }
 
       // ----------------
       // fix
@@ -1732,7 +1746,6 @@ double Variable::evaluate(char *str, Tree **tree, int ivar)
         if (!fix)
           print_var_error(FLERR,fmt::format("Invalid fix ID '{}' in variable formula",word+2),ivar);
 
-
         // parse zero or one or two trailing brackets
         // point i beyond last bracket
         // nbracket = # of bracket pairs
@@ -1754,181 +1767,200 @@ double Variable::evaluate(char *str, Tree **tree, int ivar)
           }
         }
 
-        // f_ID = scalar from global scalar, must be lowercase
+        // equal-style variable is being evaluated
 
-        if (nbracket == 0 && fix->scalar_flag && lowercase) {
+        if (style[ivar] == EQUAL) {
 
-          if (update->whichflag > 0 && update->ntimestep % fix->global_freq)
-            print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar);
+          // f_ID = scalar from global scalar
+
+          if (lowercase && nbracket == 0) {
+
+            if (!fix->scalar_flag)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (update->whichflag > 0 && update->ntimestep % fix->global_freq)
+              print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar);
+
+            value1 = fix->compute_scalar();
+            argstack[nargstack++] = value1;
+
+          // f_ID[i] = scalar from global vector
+
+          } else if (lowercase && nbracket == 1) {
+
+            if (!fix->vector_flag)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (index1 > fix->size_vector &&
+                fix->size_vector_variable == 0)
+              print_var_error(FLERR,"Variable formula fix vector is accessed out-of-range",ivar,0);
+            if (update->whichflag > 0 && update->ntimestep % fix->global_freq)
+              print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar);
+
+            value1 = fix->compute_vector(index1-1);
+            argstack[nargstack++] = value1;
+
+          // f_ID[i][j] = scalar from global array
+
+          } else if (lowercase && nbracket == 2) {
+
+            if (!fix->array_flag)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (index1 > fix->size_array_rows &&
+                fix->size_array_rows_variable == 0)
+              print_var_error(FLERR,"Variable formula fix array is accessed out-of-range",ivar,0);
+            if (index2 > fix->size_array_cols)
+              print_var_error(FLERR,"Variable formula fix array is accessed out-of-range",ivar,0);
+            if (update->whichflag > 0 && update->ntimestep % fix->global_freq)
+              print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar);
+
+            value1 = fix->compute_array(index1-1,index2-1);
+            argstack[nargstack++] = value1;
+
+          // F_ID[i] = scalar element of per-atom vector, note uppercase "F"
+
+          } else if (!lowercase && nbracket == 1) {
+
+            if (!fix->peratom_flag)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (fix->size_peratom_cols)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (update->whichflag > 0 &&
+                update->ntimestep % fix->peratom_freq)
+              print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar);
+
+            peratom2global(1,nullptr,fix->vector_atom,1,index1,tree,
+                           treestack,ntreestack,argstack,nargstack);
+
+          // F_ID[i][j] = scalar element of per-atom array, note uppercase "F"
+
+          } else if (!lowercase && nbracket == 2) {
+
+            if (!fix->peratom_flag)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (!fix->size_peratom_cols)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (index2 > fix->size_peratom_cols)
+              print_var_error(FLERR,"Variable formula fix array is accessed out-of-range",ivar,0);
+            if (update->whichflag > 0 && update->ntimestep % fix->peratom_freq)
+              print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar);
+
+            if (fix->array_atom)
+              peratom2global(1,nullptr,&fix->array_atom[0][index2-1],
+                             fix->size_peratom_cols,index1,
+                             tree,treestack,ntreestack,argstack,nargstack);
+            else
+              peratom2global(1,nullptr,nullptr,fix->size_peratom_cols,index1,
+                             tree,treestack,ntreestack,argstack,nargstack);
+
+          // no other possibilities for equal-style variable, so error
+
+          } else print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+
+        // vector-style variable is being evaluated
+
+        } else if (style[ivar] == VECTOR) {
+
+          // f_ID = vector from global vector
+
+          if (lowercase && nbracket == 0) {
+
+            if (!fix->vector_flag)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (fix->size_vector == 0)
+              print_var_error(FLERR,"Variable formula fix vector is zero length",ivar);
+            if (update->whichflag > 0 && update->ntimestep % fix->global_freq)
+              print_var_error(FLERR,"Fix in variable not computed at compatible time",ivar);
+
+            int nvec = fix->size_vector;
+            double *vec;
+            memory->create(vec,nvec,"variable:values");
+            for (int m = 0; m < nvec; m++)
+              vec[m] = fix->compute_vector(m);
 
-          value1 = fix->compute_scalar();
-          if (tree) {
             auto newtree = new Tree();
-            newtree->type = VALUE;
-            newtree->value = value1;
+            newtree->type = VECTORARRAY;
+            newtree->array = vec;
+            newtree->nvector = nvec;
+            newtree->nstride = 1;
+            newtree->selfalloc = 1;
             treestack[ntreestack++] = newtree;
-          } else argstack[nargstack++] = value1;
 
-        // f_ID[i] = scalar from global vector, must be lowercase
+          // f_ID[i] = vector from global array
 
-        } else if (nbracket == 1 && fix->vector_flag && lowercase) {
+          } else if (lowercase && nbracket == 1) {
 
-          if (index1 > fix->size_vector &&
-              fix->size_vector_variable == 0)
-            print_var_error(FLERR,"Variable formula fix vector is accessed out-of-range",ivar,0);
-          if (update->whichflag > 0 && update->ntimestep % fix->global_freq)
-            print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar);
+            if (!fix->array_flag)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (fix->size_array_rows == 0)
+              print_var_error(FLERR,"Variable formula fix array is zero length",ivar);
+            if (index1 > fix->size_array_cols)
+              print_var_error(FLERR,"Variable formula fix array is accessed out-of-range",ivar,0);
+            if (update->whichflag > 0 && update->ntimestep % fix->global_freq)
+              print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar);
+
+            int nvec = fix->size_array_rows;
+            double *vec;
+            memory->create(vec,nvec,"variable:values");
+            for (int m = 0; m < nvec; m++)
+              vec[m] = fix->compute_array(m,index1-1);
 
-          value1 = fix->compute_vector(index1-1);
-          if (tree) {
             auto newtree = new Tree();
-            newtree->type = VALUE;
-            newtree->value = value1;
+            newtree->type = VECTORARRAY;
+            newtree->array = vec;
+            newtree->nvector = nvec;
+            newtree->nstride = 1;
+            newtree->selfalloc = 1;
             treestack[ntreestack++] = newtree;
-          } else argstack[nargstack++] = value1;
 
-        // f_ID[i][j] = scalar from global array, must be lowercase
+          // no other possibilities for vector-style variable, so error
 
-        } else if (nbracket == 2 && fix->array_flag && lowercase) {
+          } else print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
 
-          if (index1 > fix->size_array_rows &&
-              fix->size_array_rows_variable == 0)
-            print_var_error(FLERR,"Variable formula fix array is accessed out-of-range",ivar,0);
-          if (index2 > fix->size_array_cols)
-            print_var_error(FLERR,"Variable formula fix array is accessed out-of-range",ivar,0);
-          if (update->whichflag > 0 && update->ntimestep % fix->global_freq)
-            print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar);
+        // atom-style variable is being evaluated
+
+        } else if (style[ivar] == ATOM) {
+
+          // f_ID = vector from per-atom vector
+
+          if (lowercase && nbracket == 0) {
+
+            if (!fix->peratom_flag)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (fix->size_peratom_cols)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (update->whichflag > 0 && update->ntimestep % fix->peratom_freq)
+              print_var_error(FLERR,"Fix in variable not computed at compatible time",ivar);
 
-          value1 = fix->compute_array(index1-1,index2-1);
-          if (tree) {
             auto newtree = new Tree();
-            newtree->type = VALUE;
-            newtree->value = value1;
+            newtree->type = ATOMARRAY;
+            newtree->array = fix->vector_atom;
+            newtree->nstride = 1;
             treestack[ntreestack++] = newtree;
-          } else argstack[nargstack++] = value1;
 
-        // f_ID = vector from global vector, lowercase or uppercase
+          // f_ID[i] = vector from per-atom array
 
-        } else if (nbracket == 0 && fix->vector_flag) {
+          } else if (lowercase && nbracket == 1) {
 
-          if (update->whichflag > 0 && update->ntimestep % fix->global_freq)
-            print_var_error(FLERR,"Fix in variable not computed at compatible time",ivar);
-          if (tree == nullptr)
-            print_var_error(FLERR,"Fix global vector in equal-style variable formula",ivar);
-          if (treetype == ATOM)
-            print_var_error(FLERR,"Fix global vector in atom-style variable formula",ivar);
-          if (fix->size_vector == 0)
-            print_var_error(FLERR,"Variable formula fix vector is zero length",ivar);
+            if (!fix->peratom_flag)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (!fix->size_peratom_cols)
+              print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+            if (index1 > fix->size_peratom_cols)
+              print_var_error(FLERR,"Variable formula fix array is accessed out-of-range",ivar,0);
+            if (update->whichflag > 0 && update->ntimestep % fix->peratom_freq)
+              print_var_error(FLERR,"Fix in variable not computed at compatible time",ivar);
 
-          int nvec = fix->size_vector;
-          double *vec;
-          memory->create(vec,nvec,"variable:values");
-          for (int m = 0; m < nvec; m++)
-            vec[m] = fix->compute_vector(m);
+            auto newtree = new Tree();
+            newtree->type = ATOMARRAY;
+            newtree->array = nullptr;
+            if (fix->array_atom)
+              newtree->array = &fix->array_atom[0][index1-1];
+            newtree->nstride = fix->size_peratom_cols;
+            treestack[ntreestack++] = newtree;
 
-          auto newtree = new Tree();
-          newtree->type = VECTORARRAY;
-          newtree->array = vec;
-          newtree->nvector = nvec;
-          newtree->nstride = 1;
-          newtree->selfalloc = 1;
-          treestack[ntreestack++] = newtree;
+          // no other possibilities for atom-style variable, so error
 
-        // f_ID[i] = vector from global array, lowercase or uppercase
-
-        } else if (nbracket == 1 && fix->array_flag) {
-
-          if (update->whichflag > 0 && update->ntimestep % fix->global_freq)
-            print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar);
-          if (tree == nullptr)
-            print_var_error(FLERR,"Fix global vector in equal-style variable formula",ivar);
-          if (treetype == ATOM)
-            print_var_error(FLERR,"Fix global vector in atom-style variable formula",ivar);
-          if (fix->size_array_rows == 0)
-            print_var_error(FLERR,"Variable formula fix array is zero length",ivar);
-
-          int nvec = fix->size_array_rows;
-          double *vec;
-          memory->create(vec,nvec,"variable:values");
-          for (int m = 0; m < nvec; m++)
-            vec[m] = fix->compute_array(m,index1-1);
-
-          auto newtree = new Tree();
-          newtree->type = VECTORARRAY;
-          newtree->array = vec;
-          newtree->nvector = nvec;
-          newtree->nstride = 1;
-          newtree->selfalloc = 1;
-          treestack[ntreestack++] = newtree;
-
-        // f_ID[i] = scalar from per-atom vector
-
-        } else if (nbracket == 1 && fix->peratom_flag &&
-                   fix->size_peratom_cols == 0) {
-
-          if (update->whichflag > 0 &&
-              update->ntimestep % fix->peratom_freq)
-            print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar);
-
-          peratom2global(1,nullptr,fix->vector_atom,1,index1,
-                         tree,treestack,ntreestack,argstack,nargstack);
-
-        // f_ID[i][j] = scalar from per-atom array
-
-        } else if (nbracket == 2 && fix->peratom_flag &&
-                   fix->size_peratom_cols > 0) {
-
-          if (index2 > fix->size_peratom_cols)
-            print_var_error(FLERR,"Variable formula fix array is accessed out-of-range",ivar,0);
-          if (update->whichflag > 0 &&
-              update->ntimestep % fix->peratom_freq)
-            print_var_error(FLERR,"Fix in variable not computed at a compatible time",ivar);
-
-          if (fix->array_atom)
-            peratom2global(1,nullptr,&fix->array_atom[0][index2-1],fix->size_peratom_cols,index1,
-                           tree,treestack,ntreestack,argstack,nargstack);
-          else
-            peratom2global(1,nullptr,nullptr,fix->size_peratom_cols,index1,
-                           tree,treestack,ntreestack,argstack,nargstack);
-
-        // f_ID = vector from per-atom vector
-
-        } else if (nbracket == 0 && fix->peratom_flag &&
-                   fix->size_peratom_cols == 0) {
-
-          if (tree == nullptr)
-            print_var_error(FLERR,"Per-atom fix in equal-style variable formula",ivar);
-          if (update->whichflag > 0 &&
-              update->ntimestep % fix->peratom_freq)
-            print_var_error(FLERR,"Fix in variable not computed at compatible time",ivar);
-
-          auto newtree = new Tree();
-          newtree->type = ATOMARRAY;
-          newtree->array = fix->vector_atom;
-          newtree->nstride = 1;
-          treestack[ntreestack++] = newtree;
-
-        // f_ID[i] = vector from per-atom array
-
-        } else if (nbracket == 1 && fix->peratom_flag &&
-                   fix->size_peratom_cols > 0) {
-
-          if (tree == nullptr)
-            print_var_error(FLERR,"Per-atom fix in equal-style variable formula",ivar);
-          if (index1 > fix->size_peratom_cols)
-            print_var_error(FLERR,"Variable formula fix array is accessed out-of-range",ivar,0);
-          if (update->whichflag > 0 &&
-              update->ntimestep % fix->peratom_freq)
-            print_var_error(FLERR,"Fix in variable not computed at compatible time",ivar);
-
-          auto newtree = new Tree();
-          newtree->type = ATOMARRAY;
-          if (fix->array_atom)
-            newtree->array = &fix->array_atom[0][index1-1];
-          newtree->nstride = fix->size_peratom_cols;
-          treestack[ntreestack++] = newtree;
-
-        } else print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+          } else print_var_error(FLERR,"Mismatched fix in variable formula",ivar);
+        }
 
       // ----------------
       // variable
@@ -1958,124 +1990,140 @@ double Variable::evaluate(char *str, Tree **tree, int ivar)
           i = ptr-str+1;
         }
 
-        // v_name = scalar from internal-style variable
-        // access value directly
+        // v_name with no bracket
 
-        if (nbracket == 0 && style[ivar] == INTERNAL) {
+        if (nbracket == 0) {
 
-          value1 = dvalue[ivar];
-          if (tree) {
-            auto newtree = new Tree();
-            newtree->type = VALUE;
-            newtree->value = value1;
-            treestack[ntreestack++] = newtree;
-          } else argstack[nargstack++] = value1;
+          // scalar from internal-style variable
+          // access value directly
 
-        // v_name = scalar from non atom/atomfile & non vector-style variable
-        // access value via retrieve()
+          if (style[ivar] == INTERNAL) {
 
-        } else if (nbracket == 0 && style[ivar] != ATOM &&
-                   style[ivar] != ATOMFILE && style[ivar] != VECTOR) {
+            value1 = dvalue[ivar];
+            if (tree) {
+              auto newtree = new Tree();
+              newtree->type = VALUE;
+              newtree->value = value1;
+              treestack[ntreestack++] = newtree;
+            } else argstack[nargstack++] = value1;
 
-          char *var = retrieve(word+2);
-          if (var == nullptr)
-            print_var_error(FLERR,"Invalid variable evaluation in variable formula",ivar);
-          if (utils::is_double(var)) {
+          // scalar from any style variable except VECTOR, ATOM, ATOMFILE
+          // access value via retrieve()
+
+          } else if (style[ivar] != ATOM && style[ivar] != ATOMFILE && style[ivar] != VECTOR) {
+
+            char *var = retrieve(word+2);
+            if (var == nullptr)
+              print_var_error(FLERR,"Invalid variable evaluation in variable formula",ivar);
+            if (!utils::is_double(var))
+              print_var_error(FLERR,"Non-numeric variable value in variable formula",ivar);
             if (tree) {
               auto newtree = new Tree();
               newtree->type = VALUE;
               newtree->value = atof(var);
               treestack[ntreestack++] = newtree;
             } else argstack[nargstack++] = atof(var);
-          } else print_var_error(FLERR,"Non-numeric variable value in variable formula",ivar);
 
-        // v_name = per-atom vector from atom-style variable
-        // evaluate the atom-style variable as newtree
+          // vector from vector-style variable
+          // evaluate the vector-style variable, put result in newtree
 
-        } else if (nbracket == 0 && style[ivar] == ATOM) {
+          } else if (style[ivar] == VECTOR) {
 
-          if (tree == nullptr)
-            print_var_error(FLERR,"Atom-style variable in equal-style variable formula",ivar);
-          if (treetype == VECTOR)
-            print_var_error(FLERR,"Atom-style variable in vector-style variable formula",ivar);
+            if (tree == nullptr)
+              print_var_error(FLERR,"Vector-style variable in equal-style variable formula",ivar);
+            if (treetype == ATOM)
+              print_var_error(FLERR,"Vector-style variable in atom-style variable formula",ivar);
 
-          Tree *newtree = nullptr;
-          evaluate(data[ivar][0],&newtree,ivar);
-          treestack[ntreestack++] = newtree;
+            double *vec;
+            int nvec = compute_vector(ivar,&vec);
 
-        // v_name = per-atom vector from atomfile-style variable
-
-        } else if (nbracket == 0 && style[ivar] == ATOMFILE) {
-
-          if (tree == nullptr)
-            print_var_error(FLERR,"Atomfile-style variable in equal-style variable formula",ivar);
-          if (treetype == VECTOR)
-            print_var_error(FLERR,"Atomfile-style variable in vector-style variable formula",ivar);
-
-          auto newtree = new Tree();
-          newtree->type = ATOMARRAY;
-          newtree->array = reader[ivar]->fixstore->vstore;
-          newtree->nstride = 1;
-          treestack[ntreestack++] = newtree;
-
-        // v_name = vector from vector-style variable
-        // evaluate the vector-style variable, put result in newtree
-
-        } else if (nbracket == 0 && style[ivar] == VECTOR) {
-
-          if (tree == nullptr)
-            print_var_error(FLERR,"Vector-style variable in equal-style variable formula",ivar);
-          if (treetype == ATOM)
-            print_var_error(FLERR,"Vector-style variable in atom-style variable formula",ivar);
-
-          double *vec;
-          int nvec = compute_vector(ivar,&vec);
-
-          auto newtree = new Tree();
-          newtree->type = VECTORARRAY;
-          newtree->array = vec;
-          newtree->nvector = nvec;
-          newtree->nstride = 1;
-          treestack[ntreestack++] = newtree;
-
-        // v_name[N] = scalar from atom-style variable
-        // compute the per-atom variable in result
-        // use peratom2global to extract single value from result
-
-        } else if (nbracket && style[ivar] == ATOM) {
-
-          double *result;
-          memory->create(result,atom->nlocal,"variable:result");
-          compute_atom(ivar,0,result,1,0);
-          peratom2global(1,nullptr,result,1,index,tree,treestack,ntreestack,argstack,nargstack);
-          memory->destroy(result);
-
-        // v_name[N] = scalar from atomfile-style variable
-
-        } else if (nbracket && style[ivar] == ATOMFILE) {
-
-          peratom2global(1,nullptr,reader[ivar]->fixstore->vstore,1,index,
-                         tree,treestack,ntreestack,argstack,nargstack);
-
-        // v_name[N] = scalar from vector-style variable
-        // compute the vector-style variable, extract single value
-
-        } else if (nbracket && style[ivar] == VECTOR) {
-
-          double *vec;
-          int nvec = compute_vector(ivar,&vec);
-          if (index <= 0 || index > nvec)
-            print_var_error(FLERR,"Invalid index into vector-style variable",ivar);
-          int m = index;   // convert from tagint to int
-
-          if (tree) {
             auto newtree = new Tree();
-            newtree->type = VALUE;
-            newtree->value = vec[m-1];
+            newtree->type = VECTORARRAY;
+            newtree->array = vec;
+            newtree->nvector = nvec;
+            newtree->nstride = 1;
             treestack[ntreestack++] = newtree;
-          } else argstack[nargstack++] = vec[m-1];
 
-        } else print_var_error(FLERR,"Mismatched variable in variable formula",ivar);
+          // vector from atom-style variable
+          // evaluate the atom-style variable as newtree
+
+          } else if (style[ivar] == ATOM) {
+
+            if (tree == nullptr)
+              print_var_error(FLERR,"Atom-style variable in equal-style variable formula",ivar);
+            if (treetype == VECTOR)
+              print_var_error(FLERR,"Atom-style variable in vector-style variable formula",ivar);
+
+            Tree *newtree = nullptr;
+            evaluate(data[ivar][0],&newtree,ivar);
+            treestack[ntreestack++] = newtree;
+
+          // vector from atomfile-style variable
+          // point to the values in FixStore instance
+
+          } else if (style[ivar] == ATOMFILE) {
+
+            if (tree == nullptr)
+              print_var_error(FLERR,"Atomfile-style variable in equal-style variable formula",ivar);
+            if (treetype == VECTOR)
+              print_var_error(FLERR,"Atomfile-style variable in vector-style variable formula",ivar);
+
+            auto newtree = new Tree();
+            newtree->type = ATOMARRAY;
+            newtree->array = reader[ivar]->fixstore->vstore;
+            newtree->nstride = 1;
+            treestack[ntreestack++] = newtree;
+
+          // no other possibilities for variable with no bracket
+
+          } else print_var_error(FLERR,"Mismatched variable in variable formula",ivar);
+
+        // v_name[i] with one bracket
+
+        } else if (nbracket == 1) {
+
+          // scalar from vector-style variable
+          // compute the vector-style variable, extract single value
+
+          if (style[ivar] == VECTOR) {
+
+            double *vec;
+            int nvec = compute_vector(ivar,&vec);
+            if (index <= 0 || index > nvec)
+              print_var_error(FLERR,"Invalid index into vector-style variable",ivar);
+            int m = index;   // convert from tagint to int
+
+            if (tree) {
+              auto newtree = new Tree();
+              newtree->type = VALUE;
+              newtree->value = vec[m-1];
+              treestack[ntreestack++] = newtree;
+            } else argstack[nargstack++] = vec[m-1];
+
+          // scalar from atom-style variable
+          // compute the per-atom variable in result
+          // use peratom2global to extract single value from result
+
+          } else if (style[ivar] == ATOM) {
+
+            double *result;
+            memory->create(result,atom->nlocal,"variable:result");
+            compute_atom(ivar,0,result,1,0);
+            peratom2global(1,nullptr,result,1,index,tree,treestack,ntreestack,argstack,nargstack);
+            memory->destroy(result);
+
+          // scalar from atomfile-style variable
+          // use peratom2global to extract single value from FixStore instance
+
+          } else if (style[ivar] == ATOMFILE) {
+
+            peratom2global(1,nullptr,reader[ivar]->fixstore->vstore,1,index,
+                           tree,treestack,ntreestack,argstack,nargstack);
+
+          // no other possibilities for variable with one bracket
+
+          } else print_var_error(FLERR,"Mismatched variable in variable formula",ivar);
+        }
 
       // ----------------
       // math/group/special/labelmap function or atom value/vector or
diff --git a/src/version.h b/src/version.h
index 572a274053..35780aa785 100644
--- a/src/version.h
+++ b/src/version.h
@@ -1,2 +1,2 @@
-#define LAMMPS_VERSION "2 Aug 2023"
+#define LAMMPS_VERSION "3 Aug 2023"
 #define LAMMPS_UPDATE "Development"
diff --git a/tools/lammps-gui/CMakeLists.txt b/tools/lammps-gui/CMakeLists.txt
index 44fc45c0e2..edfeeb1128 100644
--- a/tools/lammps-gui/CMakeLists.txt
+++ b/tools/lammps-gui/CMakeLists.txt
@@ -11,6 +11,42 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 option(LAMMPS_GUI_USE_PLUGIN "Load LAMMPS library dynamically at runtime" OFF)
 mark_as_advanced(LAMMPS_GUI_USE_PLUGIN)
+option(LAMMPS_GUI_USE_QT5 "Prefer using Qt5 over Qt6" OFF)
+
+include(CheckIncludeFileCXX)
+# helper function to check for usable omp.h header
+function(check_omp_h_include)
+  find_package(OpenMP COMPONENTS CXX QUIET)
+  if(OpenMP_CXX_FOUND)
+    set(CMAKE_REQUIRED_FLAGS ${OpenMP_CXX_FLAGS})
+    set(CMAKE_REQUIRED_INCLUDES ${OpenMP_CXX_INCLUDE_DIRS})
+    set(CMAKE_REQUIRED_LINK_OPTIONS ${OpenMP_CXX_FLAGS})
+    set(CMAKE_REQUIRED_LIBRARIES ${OpenMP_CXX_LIBRARIES})
+    check_include_file_cxx(omp.h _have_omp_h)
+  else()
+    set(_have_omp_h FALSE)
+  endif()
+  set(HAVE_OMP_H_INCLUDE ${_have_omp_h} PARENT_SCOPE)
+endfunction()
+
+# detect if we may enable OpenMP support by default
+set(BUILD_OMP_DEFAULT OFF)
+find_package(OpenMP COMPONENTS CXX QUIET)
+if(OpenMP_CXX_FOUND)
+  check_omp_h_include()
+  if(HAVE_OMP_H_INCLUDE)
+    set(BUILD_OMP_DEFAULT ON)
+  endif()
+endif()
+
+option(BUILD_OMP "Build with OpenMP support" ${BUILD_OMP_DEFAULT})
+if(BUILD_OMP)
+  find_package(OpenMP COMPONENTS CXX REQUIRED)
+  check_omp_h_include()
+  if(NOT HAVE_OMP_H_INCLUDE)
+    message(FATAL_ERROR "Cannot find the 'omp.h' header file required for full OpenMP support")
+  endif()
+endif()
 
 # checks
 # when this file is included as subdirectory in the LAMMPS build, many settings are directly imported
@@ -73,7 +109,15 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
 endif()
 
 # we require Qt 5 and at least version 5.12 at that.
-find_package(Qt5 5.12 REQUIRED COMPONENTS Widgets Charts)
+if(NOT LAMMPS_GUI_USE_QT5)
+  find_package(Qt6 6.2 COMPONENTS Widgets Charts)
+endif()
+if(NOT Qt6_FOUND)
+  find_package(Qt5 5.12 REQUIRED COMPONENTS Widgets Charts)
+  set(QT_VERSION_MAJOR "5")
+else()
+  set(QT_VERSION_MAJOR "6")
+endif()
 
 set(PROJECT_SOURCES
   main.cpp
@@ -105,7 +149,11 @@ set(PROJECT_SOURCES
   ${PLUGIN_LOADER_SRC}
   ${ICON_RC_FILE}
 )
-qt5_add_resources(PROJECT_SOURCES lammpsgui.qrc)
+if(QT_VERSION_MAJOR EQUAL 6)
+  qt6_add_resources(PROJECT_SOURCES lammpsgui.qrc)
+else()
+  qt5_add_resources(PROJECT_SOURCES lammpsgui.qrc)
+endif()
 
 if(APPLE)
   set(MACOSX_ICON_FILE ${LAMMPS_DIR}/cmake/packaging/lammps.icns)
@@ -113,10 +161,22 @@ if(APPLE)
   set(MACOSX_BACKGROUND_FILE ${LAMMPS_DIR}/cmake/packaging/LAMMPS_DMG_Background.png)
 endif()
 
-add_executable(lammps-gui
-  ${MACOSX_ICON_FILE}
-  ${PROJECT_SOURCES}
-)
+if(QT_VERSION_MAJOR EQUAL 6)
+  qt_add_executable(lammps-gui
+    MANUAL_FINALIZATION
+    ${MACOSX_ICON_FILE}
+    ${PROJECT_SOURCES}
+  )
+else()
+  add_executable(lammps-gui
+    ${MACOSX_ICON_FILE}
+    ${PROJECT_SOURCES}
+  )
+endif()
+
+if(QT_VERSION_MAJOR EQUAL 6)
+    qt_finalize_executable(lammps-gui)
+endif()
 
 # compilation settings
 if(LAMMPS_GUI_USE_PLUGIN)
@@ -128,7 +188,7 @@ else()
 endif()
 target_include_directories(lammps-gui PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
 target_compile_definitions(lammps-gui PRIVATE LAMMPS_GUI_VERSION="${PROJECT_VERSION}")
-target_link_libraries(lammps-gui PRIVATE Qt5::Widgets Qt5::Charts)
+target_link_libraries(lammps-gui PRIVATE Qt${QT_VERSION_MAJOR}::Widgets Qt${QT_VERSION_MAJOR}::Charts)
 if(BUILD_OMP)
   find_package(OpenMP COMPONENTS CXX REQUIRED)
   target_link_libraries(lammps-gui PRIVATE OpenMP::OpenMP_CXX)
@@ -209,7 +269,7 @@ elseif((CMAKE_SYSTEM_NAME STREQUAL "Windows") AND CMAKE_CROSSCOMPILING)
     COMMENT "Create zip file with windows binaries"
     BYPRODUCT LAMMPS-Win10-amd64.zip
     WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
-elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+elseif((CMAKE_SYSTEM_NAME STREQUAL "Linux") AND NOT LAMMPS_GUI_USE_PLUGIN)
   install(TARGETS lammps-gui DESTINATION ${CMAKE_INSTALL_BINDIR})
   install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/lammps-gui.desktop DESTINATION ${CMAKE_INSTALL_DATADIR}/applications/)
   install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/lammps-input.xml DESTINATION ${CMAKE_INSTALL_DATADIR}/mime/packages/)
diff --git a/tools/lammps-gui/TODO.md b/tools/lammps-gui/TODO.md
index e4ca44ba3d..ee05e67225 100644
--- a/tools/lammps-gui/TODO.md
+++ b/tools/lammps-gui/TODO.md
@@ -23,7 +23,6 @@ LAMMPS-GUI TODO list:
 
 # Long term ideas (v2.x)
 - rewrite entire application to build the App and its layout manually
-- port to Qt6 (with compatibility to Qt5?)
 - also a rewrite should establish consistent naming conventions. now we have a mix of LAMMPS style, Qt style, and others.
 - add option to attach a debugger to the running program (highly non-portable, need customization support in preferences)
 - write a "wizard" dialog that can be used for beginners to create an input file template for a few typical use scenarios
diff --git a/tools/lammps-gui/chartviewer.cpp b/tools/lammps-gui/chartviewer.cpp
index f28625d9dc..fbd888f1cd 100644
--- a/tools/lammps-gui/chartviewer.cpp
+++ b/tools/lammps-gui/chartviewer.cpp
@@ -15,11 +15,20 @@
 
 #include "lammpsgui.h"
 
+#include <QAction>
+#include <QApplication>
+#include <QFileDialog>
 #include <QHBoxLayout>
+#include <QKeySequence>
+#include <QLabel>
+#include <QLayout>
 #include <QLineSeries>
+#include <QMenu>
+#include <QMenuBar>
 #include <QPushButton>
 #include <QSettings>
 #include <QSpacerItem>
+#include <QTextStream>
 #include <QVBoxLayout>
 
 using namespace QtCharts;
@@ -53,13 +62,13 @@ ChartWindow::ChartWindow(const QString &_filename, QWidget *parent) :
     file->addSeparator();
     stopAct = file->addAction("Stop &Run", this, &ChartWindow::stop_run);
     stopAct->setIcon(QIcon(":/icons/process-stop.png"));
-    stopAct->setShortcut(QKeySequence::fromString("Ctrl+/"));
+    stopAct->setShortcut(QKeySequence(Qt::CTRL | Qt::Key_Slash));
     closeAct = file->addAction("&Close", this, &QWidget::close);
     closeAct->setIcon(QIcon(":/icons/window-close.png"));
-    closeAct->setShortcut(QKeySequence::fromString("Ctrl+W"));
+    closeAct->setShortcut(QKeySequence(Qt::CTRL | Qt::Key_W));
     quitAct = file->addAction("&Quit", this, &ChartWindow::quit);
     quitAct->setIcon(QIcon(":/icons/application-exit.png"));
-    quitAct->setShortcut(QKeySequence::fromString("Ctrl+Q"));
+    quitAct->setShortcut(QKeySequence(Qt::CTRL | Qt::Key_Q));
     auto *layout = new QVBoxLayout;
     layout->addLayout(top);
     setLayout(layout);
@@ -76,7 +85,10 @@ int ChartWindow::get_step() const
 {
     if (charts.size() > 0) {
         auto *v = charts[0];
-        return (int)v->get_step(v->get_count() - 1);
+        if (v)
+          return (int)v->get_step(v->get_count() - 1);
+        else
+          return -1;
     } else {
         return -1;
     }
@@ -115,10 +127,10 @@ void ChartWindow::add_data(int step, double data, int index)
 
 void ChartWindow::quit()
 {
-    LammpsGui *main;
+    LammpsGui *main = nullptr;
     for (QWidget *widget : QApplication::topLevelWidgets())
         if (widget->objectName() == "LammpsGui") main = dynamic_cast<LammpsGui *>(widget);
-    main->quit();
+    if (main) main->quit();
 }
 
 void ChartWindow::reset_zoom()
@@ -129,10 +141,10 @@ void ChartWindow::reset_zoom()
 
 void ChartWindow::stop_run()
 {
-    LammpsGui *main;
+    LammpsGui *main = nullptr;
     for (QWidget *widget : QApplication::topLevelWidgets())
         if (widget->objectName() == "LammpsGui") main = dynamic_cast<LammpsGui *>(widget);
-    main->stop_run();
+    if (main) main->stop_run();
 }
 
 void ChartWindow::saveAs()
@@ -288,7 +300,7 @@ void ChartViewer::add_data(int step, double data)
     if (last_step < step) {
         last_step = step;
         series->append(step, data);
-        auto points = series->pointsVector();
+        auto points = series->points();
 
         qreal xmin = 1.0e100;
         qreal xmax = -1.0e100;
@@ -309,7 +321,7 @@ void ChartViewer::add_data(int step, double data)
 
 void ChartViewer::reset_zoom()
 {
-    auto points = series->pointsVector();
+    auto points = series->points();
 
     qreal xmin = 1.0e100;
     qreal xmax = -1.0e100;
diff --git a/tools/lammps-gui/chartviewer.h b/tools/lammps-gui/chartviewer.h
index 248fdad7bb..da0468eaf8 100644
--- a/tools/lammps-gui/chartviewer.h
+++ b/tools/lammps-gui/chartviewer.h
@@ -14,16 +14,17 @@
 #ifndef CHARTVIEWER_H
 #define CHARTVIEWER_H
 
+#include <QComboBox>
 #include <QList>
 #include <QString>
 #include <QWidget>
-#include <QtCharts>
 
 class QAction;
 class QMenuBar;
 class QMenu;
-class QComboBox;
+namespace QtCharts {
 class ChartViewer;
+}
 
 class ChartWindow : public QWidget {
     Q_OBJECT
@@ -64,12 +65,18 @@ private:
     QAction *closeAct, *stopAct, *quitAct;
 
     QString filename;
-    QList<ChartViewer *> charts;
+    QList<QtCharts::ChartViewer *> charts;
 };
 
 /* -------------------------------------------------------------------- */
 
-class ChartViewer : public QtCharts::QChartView {
+#include <QChart>
+#include <QChartView>
+#include <QLineSeries>
+#include <QValueAxis>
+
+namespace QtCharts {
+class ChartViewer : public QChartView {
     Q_OBJECT
 
 public:
@@ -81,16 +88,17 @@ public:
     int get_index() const { return index; };
     int get_count() const { return series->count(); }
     const char *get_title() const { return series->name().toLocal8Bit(); }
-    double get_step(int index) const { return series->at(index).x(); }
-    double get_data(int index) const { return series->at(index).y(); }
+    double get_step(int index) const { return (index < 0) ? 0.0 : series->at(index).x(); }
+    double get_data(int index) const { return (index < 0) ? 0.0 : series->at(index).y(); }
 
 private:
     int last_step, index;
-    QtCharts::QChart *chart;
-    QtCharts::QLineSeries *series;
-    QtCharts::QValueAxis *xaxis;
-    QtCharts::QValueAxis *yaxis;
+    QChart *chart;
+    QLineSeries *series;
+    QValueAxis *xaxis;
+    QValueAxis *yaxis;
 };
+} // namespace QtCharts
 #endif
 
 // Local Variables:
diff --git a/tools/lammps-gui/codeeditor.cpp b/tools/lammps-gui/codeeditor.cpp
index 34193bc320..e95f576be0 100644
--- a/tools/lammps-gui/codeeditor.cpp
+++ b/tools/lammps-gui/codeeditor.cpp
@@ -564,14 +564,16 @@ void CodeEditor::keyPressEvent(QKeyEvent *event)
     // process key event in parent class
     QPlainTextEdit::keyPressEvent(event);
 
-    // if enabled, try pop up completion automatically after 3 characters
+    // if enabled, try pop up completion automatically after 2 characters
     if (automatic_completion) {
         auto cursor = textCursor();
         auto line   = cursor.block().text();
+        if (line.isEmpty()) return;
 
         // QTextCursor::WordUnderCursor is unusable here since recognizes '/' as word boundary.
         // Work around it by manually searching for the location of the beginning of the word.
-        int begin = cursor.positionInBlock();
+        int begin = qMin(cursor.positionInBlock(), line.length() - 1);
+
         while (begin >= 0) {
             if (line[begin].isSpace()) break;
             --begin;
@@ -748,7 +750,7 @@ void CodeEditor::runCompletion()
     // QTextCursor::WordUnderCursor is unusable here since it recognizes '/' as word boundary.
     // Work around it by manually searching for the beginning and end position of the word
     // under the cursor and then using that substring.
-    int begin = cursor.positionInBlock();
-    line      = cursor.block().text();
+    line      = cursor.block().text();
+    int begin = qMin(cursor.positionInBlock(), line.length() - 1);
     while (begin >= 0) {
         if (line[begin].isSpace()) break;
@@ -990,8 +992,26 @@ void CodeEditor::insertCompletedCommand(const QString &completion)
 {
     auto *completer = qobject_cast<QCompleter *>(sender());
     if (completer->widget() != this) return;
+
+    // select the entire word (non-space text) under the cursor
+    // we need to do it in this complicated way, since QTextCursor does not recognize
+    // special characters as part of a word. Clamp start: cursor may sit past the last char.
     auto cursor = textCursor();
-    cursor.movePosition(QTextCursor::StartOfWord, QTextCursor::KeepAnchor);
+    auto line   = cursor.block().text();
+    int begin   = qMin(cursor.positionInBlock(), line.length() - 1);
+    while (begin >= 0) {
+        if (line[begin].isSpace()) break;
+        --begin;
+    }
+
+    int end = begin + 1;
+    while (end < line.length()) {
+        if (line[end].isSpace()) break;
+        ++end;
+    }
+
+    cursor.setPosition(cursor.position() - cursor.positionInBlock() + begin + 1);
+    cursor.movePosition(QTextCursor::NextCharacter, QTextCursor::KeepAnchor, end - begin - 1);
     cursor.insertText(completion);
     setTextCursor(cursor);
 }
diff --git a/tools/lammps-gui/imageviewer.cpp b/tools/lammps-gui/imageviewer.cpp
index 86be0b66df..00b08f3f47 100644
--- a/tools/lammps-gui/imageviewer.cpp
+++ b/tools/lammps-gui/imageviewer.cpp
@@ -23,6 +23,7 @@
 #include <QGuiApplication>
 #include <QImage>
 #include <QImageReader>
+#include <QKeySequence>
 #include <QLabel>
 #include <QLineEdit>
 #include <QMenuBar>
@@ -131,7 +132,7 @@ static const QString blank(" ");
 
 ImageViewer::ImageViewer(const QString &fileName, LammpsWrapper *_lammps, QWidget *parent) :
     QDialog(parent), menuBar(new QMenuBar), imageLabel(new QLabel), scrollArea(new QScrollArea),
-    lammps(_lammps), group("all"), filename(fileName), useelements(false)
+    lammps(_lammps), group("all"), filename(fileName), useelements(false), usediameter(false)
 {
     imageLabel->setBackgroundRole(QPalette::Base);
     imageLabel->setSizePolicy(QSizePolicy::Ignored, QSizePolicy::Ignored);
@@ -268,7 +269,7 @@ ImageViewer::ImageViewer(const QString &fileName, LammpsWrapper *_lammps, QWidge
     // properties directly since lookup in reset_view() will have failed
     dobox->setChecked(showbox);
     dovdw->setChecked(vdwfactor > 1.0);
-    dovdw->setEnabled(useelements);
+    dovdw->setEnabled(useelements || usediameter);
     doaxes->setChecked(showaxes);
     dossao->setChecked(usessao);
     doanti->setChecked(antialias);
@@ -435,7 +436,7 @@ void ImageViewer::createImage()
     dumpcmd += "'" + dumpfile.fileName() + "'";
 
     settings.beginGroup("snapshot");
-    int hhrot    = (hrot > 180) ? 360 - hrot : hrot;
+    int hhrot = (hrot > 180) ? 360 - hrot : hrot;
 
     // determine elements from masses and set their covalent radii
     int ntypes       = lammps->extract_setting("ntypes");
@@ -454,9 +455,10 @@ void ImageViewer::createImage()
             adiams += QString("adiam %1 %2 ").arg(i).arg(vdwfactor * pte_vdw_radius[idx]);
         }
     }
+    usediameter = lammps->extract_setting("radius_flag") != 0;
 
     // adjust pushbutton state and clear adiams string to disable VDW display, if needed
-    if (useelements) {
+    if (useelements || usediameter) {
         auto *button = findChild<QPushButton *>("vdw");
         if (button) button->setEnabled(true);
     } else {
@@ -469,7 +471,10 @@ void ImageViewer::createImage()
         dumpcmd += blank + "element";
     else
         dumpcmd += blank + settings.value("color", "type").toString();
-    dumpcmd += blank + settings.value("diameter", "type").toString();
+    if (usediameter && (vdwfactor > 1.0))
+        dumpcmd += blank + "diameter";
+    else
+        dumpcmd += blank + settings.value("diameter", "type").toString();
     dumpcmd += QString(" size %1 %2").arg(xsize).arg(ysize);
     dumpcmd += QString(" zoom %1").arg(zoom);
     dumpcmd += " shiny 0.5 ";
@@ -528,10 +533,10 @@ void ImageViewer::copy() {}
 
 void ImageViewer::quit()
 {
-    LammpsGui *main;
+    LammpsGui *main = nullptr;
     for (QWidget *widget : QApplication::topLevelWidgets())
         if (widget->objectName() == "LammpsGui") main = dynamic_cast<LammpsGui *>(widget);
-    main->quit();
+    if (main) main->quit();
 }
 
 void ImageViewer::saveFile(const QString &fileName)
@@ -554,10 +559,10 @@ void ImageViewer::createActions()
     fileMenu->addSeparator();
     QAction *exitAct = fileMenu->addAction("&Close", this, &QWidget::close);
     exitAct->setIcon(QIcon(":/icons/window-close.png"));
-    exitAct->setShortcut(QKeySequence::fromString("Ctrl+W"));
+    exitAct->setShortcut(QKeySequence(Qt::CTRL | Qt::Key_W));
     QAction *quitAct = fileMenu->addAction("&Quit", this, &ImageViewer::quit);
     quitAct->setIcon(QIcon(":/icons/application-exit.png"));
-    quitAct->setShortcut(QKeySequence::fromString("Ctrl+Q"));
+    quitAct->setShortcut(QKeySequence(Qt::CTRL | Qt::Key_Q));
 }
 
 void ImageViewer::updateActions()
diff --git a/tools/lammps-gui/imageviewer.h b/tools/lammps-gui/imageviewer.h
index 8946c6cc8b..1be7790666 100644
--- a/tools/lammps-gui/imageviewer.h
+++ b/tools/lammps-gui/imageviewer.h
@@ -88,7 +88,7 @@ private:
     int xsize, ysize;
     int hrot, vrot;
     double zoom, vdwfactor;
-    bool showbox, showaxes, antialias, usessao, useelements;
+    bool showbox, showaxes, antialias, usessao, useelements, usediameter;
 };
 #endif
 
diff --git a/tools/lammps-gui/lammpsgui.cpp b/tools/lammps-gui/lammpsgui.cpp
index ba080dbec3..11f2554b55 100644
--- a/tools/lammps-gui/lammpsgui.cpp
+++ b/tools/lammps-gui/lammpsgui.cpp
@@ -35,6 +35,7 @@
 #include <QLabel>
 #include <QLocale>
 #include <QMessageBox>
+#include <QMetaType>
 #include <QPlainTextEdit>
 #include <QProcess>
 #include <QProgressBar>
@@ -69,8 +70,10 @@ LammpsGui::LammpsGui(QWidget *parent, const char *filename) :
     // enforce using the plain ASCII C locale within the GUI.
     QLocale::setDefault(QLocale("C"));
 
-    // register QList<QString>
+#if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
+    // register QList<QString> only needed for Qt5
     qRegisterMetaTypeStreamOperators<QList<QString>>("QList<QString>");
+#endif
 
     ui->setupUi(this);
     this->setCentralWidget(ui->textEdit);
@@ -81,9 +84,13 @@ LammpsGui::LammpsGui(QWidget *parent, const char *filename) :
     // use $HOME if we get dropped to "/" like on macOS
     if (current_dir == "/") current_dir = QDir::homePath();
 
+#define stringify(x) myxstr(x)
+#define myxstr(x) #x
     QCoreApplication::setOrganizationName("The LAMMPS Developers");
     QCoreApplication::setOrganizationDomain("lammps.org");
-    QCoreApplication::setApplicationName("LAMMPS GUI");
+    QCoreApplication::setApplicationName("LAMMPS GUI - QT" stringify(QT_VERSION_MAJOR));
+#undef stringify
+#undef myxstr
 
     // restore and initialize settings
     QSettings settings;
@@ -588,7 +595,8 @@ void LammpsGui::open_file(const QString &fileName)
     if (!file.open(QIODevice::ReadOnly | QFile::Text)) {
         QMessageBox::warning(this, "Warning",
                              "Cannot open file " + path.absoluteFilePath() + ": " +
-                                 file.errorString() + ".\nWill create new file on saving editor buffer.");
+                                 file.errorString() +
+                                 ".\nWill create new file on saving editor buffer.");
         ui->textEdit->document()->setPlainText(QString());
     } else {
         QTextStream in(&file);
@@ -1039,9 +1047,9 @@ void LammpsGui::do_run(bool use_buffer)
     logwindow->document()->setDefaultFont(text_font);
     logwindow->setLineWrapMode(LogWindow::NoWrap);
     logwindow->setMinimumSize(400, 300);
-    QShortcut *shortcut = new QShortcut(QKeySequence(Qt::CTRL + Qt::Key_W), logwindow);
+    QShortcut *shortcut = new QShortcut(QKeySequence(Qt::CTRL | Qt::Key_W), logwindow);
     QObject::connect(shortcut, &QShortcut::activated, logwindow, &LogWindow::close);
-    shortcut = new QShortcut(QKeySequence(Qt::CTRL + Qt::Key_Slash), logwindow);
+    shortcut = new QShortcut(QKeySequence(Qt::CTRL | Qt::Key_Slash), logwindow);
     QObject::connect(shortcut, &QShortcut::activated, this, &LammpsGui::stop_run);
     if (settings.value("viewlog", true).toBool())
         logwindow->show();
@@ -1058,9 +1066,9 @@ void LammpsGui::do_run(bool use_buffer)
             .arg(run_counter));
     chartwindow->setWindowIcon(QIcon(":/icons/lammps-icon-128x128.png"));
     chartwindow->setMinimumSize(400, 300);
-    shortcut = new QShortcut(QKeySequence(Qt::CTRL + Qt::Key_W), chartwindow);
+    shortcut = new QShortcut(QKeySequence(Qt::CTRL | Qt::Key_W), chartwindow);
     QObject::connect(shortcut, &QShortcut::activated, chartwindow, &ChartWindow::close);
-    shortcut = new QShortcut(QKeySequence(Qt::CTRL + Qt::Key_Slash), chartwindow);
+    shortcut = new QShortcut(QKeySequence(Qt::CTRL | Qt::Key_Slash), chartwindow);
     QObject::connect(shortcut, &QShortcut::activated, this, &LammpsGui::stop_run);
     if (settings.value("viewchart", true).toBool())
         chartwindow->show();
diff --git a/tools/lammps-gui/lammpsgui.h b/tools/lammps-gui/lammpsgui.h
index 0c622f0285..0dd34f2c49 100644
--- a/tools/lammps-gui/lammpsgui.h
+++ b/tools/lammps-gui/lammpsgui.h
@@ -16,8 +16,10 @@
 
 #include <QMainWindow>
 
+#include <QGridLayout>
 #include <QList>
 #include <QPair>
+#include <QSpacerItem>
 #include <QString>
 #include <vector>
 
diff --git a/tools/lammps-gui/logwindow.cpp b/tools/lammps-gui/logwindow.cpp
index ab1886f1bd..73ec81d06c 100644
--- a/tools/lammps-gui/logwindow.cpp
+++ b/tools/lammps-gui/logwindow.cpp
@@ -35,12 +35,14 @@ LogWindow::LogWindow(const QString &_filename, QWidget *parent) :
     QSettings settings;
     resize(settings.value("logx", 500).toInt(), settings.value("logy", 320).toInt());
 
-    auto action = new QShortcut(QKeySequence::fromString("Ctrl+S"), this);
+    auto action = new QShortcut(QKeySequence(Qt::CTRL | Qt::Key_S), this);
     connect(action, &QShortcut::activated, this, &LogWindow::save_as);
-    action = new QShortcut(QKeySequence::fromString("Ctrl+Q"), this);
+    action = new QShortcut(QKeySequence(Qt::CTRL | Qt::Key_Q), this);
     connect(action, &QShortcut::activated, this, &LogWindow::quit);
-    action = new QShortcut(QKeySequence(Qt::Key_Slash, Qt::CTRL), this);
+    action = new QShortcut(QKeySequence(Qt::CTRL | Qt::Key_Slash), this);
     connect(action, &QShortcut::activated, this, &LogWindow::stop_run);
+
+    installEventFilter(this);
 }
 
 void LogWindow::closeEvent(QCloseEvent *event)
@@ -55,18 +57,18 @@ void LogWindow::closeEvent(QCloseEvent *event)
 
 void LogWindow::quit()
 {
-    LammpsGui *main;
+    LammpsGui *main = nullptr;
     for (QWidget *widget : QApplication::topLevelWidgets())
         if (widget->objectName() == "LammpsGui") main = dynamic_cast<LammpsGui *>(widget);
-    main->quit();
+    if (main) main->quit();
 }
 
 void LogWindow::stop_run()
 {
-    LammpsGui *main;
+    LammpsGui *main = nullptr;
     for (QWidget *widget : QApplication::topLevelWidgets())
         if (widget->objectName() == "LammpsGui") main = dynamic_cast<LammpsGui *>(widget);
-    main->stop_run();
+    if (main) main->stop_run();
 }
 
 void LogWindow::save_as()
@@ -99,15 +101,35 @@ void LogWindow::contextMenuEvent(QContextMenuEvent *event)
     menu->addSeparator();
     auto action = menu->addAction(QString("Save Log to File ..."));
     action->setIcon(QIcon(":/icons/document-save-as.png"));
-    action->setShortcut(QKeySequence::fromString("Ctrl+S"));
+    action->setShortcut(QKeySequence(Qt::CTRL | Qt::Key_S));
     connect(action, &QAction::triggered, this, &LogWindow::save_as);
     action = menu->addAction("&Close Window", this, &QWidget::close);
     action->setIcon(QIcon(":/icons/window-close.png"));
-    action->setShortcut(QKeySequence::fromString("Ctrl+W"));
+    action->setShortcut(QKeySequence(Qt::CTRL | Qt::Key_W));
     menu->exec(event->globalPos());
     delete menu;
 }
 
+// event filter to handle "Ambiguous shortcut override" issues
+bool LogWindow::eventFilter(QObject *watched, QEvent *event)
+{
+    if (event->type() == QEvent::ShortcutOverride) {
+        QKeyEvent *keyEvent = dynamic_cast<QKeyEvent *>(event);
+        if (!keyEvent) return QWidget::eventFilter(watched, event);
+        if (keyEvent->modifiers().testFlag(Qt::ControlModifier) && keyEvent->key() == '/') {
+            stop_run();
+            event->accept();
+            return true;
+        }
+        if (keyEvent->modifiers().testFlag(Qt::ControlModifier) && keyEvent->key() == 'W') {
+            close();
+            event->accept();
+            return true;
+        }
+    }
+    return QWidget::eventFilter(watched, event);
+}
+
 // Local Variables:
 // c-basic-offset: 4
 // End:
diff --git a/tools/lammps-gui/logwindow.h b/tools/lammps-gui/logwindow.h
index 8923e35ee5..ad0691d0cc 100644
--- a/tools/lammps-gui/logwindow.h
+++ b/tools/lammps-gui/logwindow.h
@@ -30,6 +30,7 @@ private slots:
 protected:
     void closeEvent(QCloseEvent *event) override;
     void contextMenuEvent(QContextMenuEvent *event) override;
+    bool eventFilter(QObject *watched, QEvent *event) override;
 
 private:
     QString filename;
diff --git a/tools/lammps-gui/preferences.cpp b/tools/lammps-gui/preferences.cpp
index c760e6610b..fd01bb5046 100644
--- a/tools/lammps-gui/preferences.cpp
+++ b/tools/lammps-gui/preferences.cpp
@@ -286,12 +286,12 @@ GeneralTab::GeneralTab(QSettings *_settings, LammpsWrapper *_lammps, QWidget *pa
 
 void GeneralTab::updatefonts(const QFont &all, const QFont &text)
 {
-    LammpsGui *main;
+    LammpsGui *main = nullptr;
     for (QWidget *widget : QApplication::topLevelWidgets())
         if (widget->objectName() == "LammpsGui") main = dynamic_cast<LammpsGui *>(widget);
 
     QApplication::setFont(all);
-    main->ui->textEdit->document()->setDefaultFont(text);
+    if (main) main->ui->textEdit->document()->setDefaultFont(text);
 }
 
 void GeneralTab::newallfont()
@@ -410,11 +410,19 @@ AcceleratorTab::AcceleratorTab(QSettings *_settings, LammpsWrapper *_lammps, QWi
 #endif
     auto *choices      = new QFrame;
     auto *choiceLayout = new QVBoxLayout;
+#if defined(_OPENMP)
     auto *ntlabel      = new QLabel(QString("Number of threads (max %1):").arg(maxthreads));
     auto *ntchoice     = new QLineEdit(settings->value("nthreads", maxthreads).toString());
+#else
+    auto *ntlabel      = new QLabel(QString("Number of threads (OpenMP not available):"));
+    auto *ntchoice     = new QLineEdit("1");
+#endif
     auto *intval       = new QIntValidator(1, maxthreads, this);
     ntchoice->setValidator(intval);
     ntchoice->setObjectName("nthreads");
+#if !defined(_OPENMP)
+    ntchoice->setEnabled(false);
+#endif
 
     choiceLayout->addWidget(ntlabel);
     choiceLayout->addWidget(ntchoice);
diff --git a/tools/lammps-gui/slideshow.cpp b/tools/lammps-gui/slideshow.cpp
index de7742f22f..140c703ca3 100644
--- a/tools/lammps-gui/slideshow.cpp
+++ b/tools/lammps-gui/slideshow.cpp
@@ -25,6 +25,7 @@
 #include <QHBoxLayout>
 #include <QImage>
 #include <QImageReader>
+#include <QKeySequence>
 #include <QLabel>
 #include <QPalette>
 #include <QProcess>
@@ -50,11 +51,11 @@ SlideShow::SlideShow(const QString &fileName, QWidget *parent) :
     imageName->setAlignment(Qt::AlignCenter);
     imageName->setSizePolicy(QSizePolicy::Expanding, QSizePolicy::Expanding);
 
-    auto *shortcut = new QShortcut(QKeySequence::fromString("Ctrl+W"), this);
+    auto *shortcut = new QShortcut(QKeySequence(Qt::CTRL | Qt::Key_W), this);
     QObject::connect(shortcut, &QShortcut::activated, this, &QWidget::close);
-    shortcut = new QShortcut(QKeySequence::fromString("Ctrl+/"), this);
+    shortcut = new QShortcut(QKeySequence(Qt::CTRL | Qt::Key_Slash), this);
     QObject::connect(shortcut, &QShortcut::activated, this, &SlideShow::stop_run);
-    shortcut = new QShortcut(QKeySequence::fromString("Ctrl+Q"), this);
+    shortcut = new QShortcut(QKeySequence(Qt::CTRL | Qt::Key_Q), this);
     QObject::connect(shortcut, &QShortcut::activated, this, &SlideShow::quit);
 
     buttonBox = new QDialogButtonBox(QDialogButtonBox::Close);
@@ -198,18 +199,18 @@ void SlideShow::loadImage(int idx)
 
 void SlideShow::quit()
 {
-    LammpsGui *main;
+    LammpsGui *main = nullptr;
     for (QWidget *widget : QApplication::topLevelWidgets())
         if (widget->objectName() == "LammpsGui") main = dynamic_cast<LammpsGui *>(widget);
-    main->quit();
+    if (main) main->quit();
 }
 
 void SlideShow::stop_run()
 {
-    LammpsGui *main;
+    LammpsGui *main = nullptr;
     for (QWidget *widget : QApplication::topLevelWidgets())
         if (widget->objectName() == "LammpsGui") main = dynamic_cast<LammpsGui *>(widget);
-    main->stop_run();
+    if (main) main->stop_run();
 }
 
 void SlideShow::movie()
diff --git a/tools/lammps-gui/stdcapture.cpp b/tools/lammps-gui/stdcapture.cpp
index 428277cc10..b09aebf053 100644
--- a/tools/lammps-gui/stdcapture.cpp
+++ b/tools/lammps-gui/stdcapture.cpp
@@ -77,6 +77,7 @@ bool StdCapture::EndCapture()
 
     int bytesRead;
     bool fd_blocked;
+    int maxwait = 100;
 
     do {
         bytesRead  = 0;
@@ -93,9 +94,10 @@ bool StdCapture::EndCapture()
             buf[bytesRead] = 0;
             m_captured += buf;
         } else if (bytesRead < 0) {
-            fd_blocked = ((errno == EAGAIN) || (errno == EWOULDBLOCK) || (errno == EINTR));
+            fd_blocked = ((errno == EAGAIN) || (errno == EWOULDBLOCK) || (errno == EINTR)) && (maxwait > 0);
 
             if (fd_blocked) std::this_thread::sleep_for(std::chrono::milliseconds(10));
+            --maxwait;
         }
     } while (fd_blocked || (bytesRead == (bufSize - 1)));
     m_capturing = false;
diff --git a/unittest/formats/CMakeLists.txt b/unittest/formats/CMakeLists.txt
index 93ea2f3b32..58c797b6e6 100644
--- a/unittest/formats/CMakeLists.txt
+++ b/unittest/formats/CMakeLists.txt
@@ -41,6 +41,8 @@ set_tests_properties(TextFileReader PROPERTIES ENVIRONMENT "LAMMPS_POTENTIALS=${
 add_executable(test_file_operations test_file_operations.cpp)
 target_link_libraries(test_file_operations PRIVATE lammps GTest::GMock)
 add_test(NAME FileOperations COMMAND test_file_operations)
+# try to mitigate possible OpenMPI bug
+set_tests_properties(FileOperations PROPERTIES ENVIRONMENT "OMPI_MCA_sharedfp=\"^sm\"")
 
 add_executable(test_dump_atom test_dump_atom.cpp)
 target_link_libraries(test_dump_atom PRIVATE lammps GTest::GMock)