diff --git a/doc/src/Python_properties.rst b/doc/src/Python_properties.rst
index d8e772379c..031461660a 100644
--- a/doc/src/Python_properties.rst
+++ b/doc/src/Python_properties.rst
@@ -53,6 +53,7 @@ against invalid accesses.
 
       * :py:meth:`version() <lammps.lammps.version()>`: return the numerical version id, e.g. LAMMPS 2 Sep 2015 -> 20150902
       * :py:meth:`get_thermo() <lammps.lammps.get_thermo()>`: return current value of a thermo keyword
+      * :py:meth:`last_thermo() <lammps.lammps.last_thermo()>`: return a dictionary of the last thermodynamic output
       * :py:meth:`get_natoms() <lammps.lammps.get_natoms()>`: total # of atoms as int
       * :py:meth:`reset_box() <lammps.lammps.reset_box()>`: reset the simulation box size
       * :py:meth:`extract_setting() <lammps.lammps.extract_setting()>`: return a global setting
@@ -60,6 +61,10 @@ against invalid accesses.
       * :py:meth:`extract_box() <lammps.lammps.extract_box()>`: extract box info
       * :py:meth:`create_atoms() <lammps.lammps.create_atoms()>`: create N atoms with IDs, types, x, v, and image flags
 
+      **Properties**:
+
+      * :py:attr:`last_thermo_step <lammps.lammps.last_thermo_step>`: the last timestep thermodynamic output was computed
+
    .. tab:: PyLammps/IPyLammps API
 
       In addition to the functions provided by :py:class:`lammps <lammps.lammps>`, :py:class:`PyLammps <lammps.PyLammps>` objects
diff --git a/doc/src/package.rst b/doc/src/package.rst
index 6d425b63dd..b0549ba7d9 100644
--- a/doc/src/package.rst
+++ b/doc/src/package.rst
@@ -49,7 +49,7 @@ Syntax
        *intel* args = NPhi keyword value ...
          Nphi = # of co-processors per node
          zero or more keyword/value pairs may be appended
-         keywords = *mode* or *omp* or *lrt* or *balance* or *ghost* or *tpc* or *tptask* or *no_affinity*
+         keywords = *mode* or *omp* or *lrt* or *balance* or *ghost* or *tpc* or *tptask* or *pppm_table* or *no_affinity*
            *mode* value = *single* or *mixed* or *double*
              single = perform force calculations in single precision
              mixed = perform force calculations in mixed precision
@@ -68,6 +68,9 @@ Syntax
              Ntpc = max number of co-processor threads per co-processor core (default = 4)
            *tptask* value = Ntptask
              Ntptask = max number of co-processor threads per MPI task (default = 240)
+           *pppm_table* value = *yes* or *no*
+             *yes* = Precompute pppm values in table (doesn't change accuracy)
+             *no* = Compute pppm values on the fly
            *no_affinity* values = none
        *kokkos* args = keyword value ...
          zero or more keyword/value pairs may be appended
@@ -392,13 +395,13 @@ is identical to the default *verlet* style aside from supporting the
 LRT feature. This feature requires setting the pre-processor flag
 -DLMP_INTEL_USELRT in the makefile when compiling LAMMPS.
 
-The *balance* keyword sets the fraction of :doc:`pair style <pair_style>` work offloaded to the co-processor for split
-values between 0.0 and 1.0 inclusive.  While this fraction of work is
-running on the co-processor, other calculations will run on the host,
-including neighbor and pair calculations that are not offloaded, as
-well as angle, bond, dihedral, kspace, and some MPI communications.
-If *split* is set to -1, the fraction of work is dynamically adjusted
-automatically throughout the run.  This typically give performance
+The *balance* keyword sets the fraction of :doc:`pair style <pair_style>` work
+offloaded to the co-processor for split values between 0.0 and 1.0 inclusive.
+While this fraction of work is running on the co-processor, other calculations
+will run on the host, including neighbor and pair calculations that are not
+offloaded, as well as angle, bond, dihedral, kspace, and some MPI
+communications.  If *split* is set to -1, the fraction of work is dynamically
+adjusted automatically throughout the run.  This typically give performance
 within 5 to 10 percent of the optimal fixed fraction.
 
 The *ghost* keyword determines whether or not ghost atoms, i.e. atoms
@@ -430,6 +433,13 @@ with 16 threads, for a total of 128.
 Note that the default settings for *tpc* and *tptask* are fine for
 most problems, regardless of how many MPI tasks you assign to a Phi.
 
+.. versionadded:: TBD
+
+The *pppm_table* keyword with the argument yes allows to use a
+pre-computed table to efficiently spread the charge to the PPPM grid.
+This feature is enabled by default but can be turned off using the
+keyword with the argument *no*.
+
 The *no_affinity* keyword will turn off automatic setting of core
 affinity for MPI tasks and OpenMP threads on the host when using
 offload to a co-processor. Affinity settings are used when possible
@@ -699,39 +709,37 @@ Related commands
 Default
 """""""
 
-For the GPU package, the default is Ngpu = 0 and the option defaults
-are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0
-to Ngpu-1, tpa = 1, omp = 0, and platform=-1.  These settings are made
-automatically if the "-sf gpu" :doc:`command-line switch <Run_options>`
-is used.  If it is not used, you must invoke the package gpu command
-in your input script or via the "-pk gpu" :doc:`command-line switch <Run_options>`.
+For the GPU package, the default is Ngpu = 0 and the option defaults are neigh
+= yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0 to Ngpu-1, tpa = 1,
+omp = 0, and platform=-1.  These settings are made automatically if the "-sf
+gpu" :doc:`command-line switch <Run_options>` is used.  If it is not used, you
+must invoke the package gpu command in your input script or via the "-pk gpu"
+:doc:`command-line switch <Run_options>`.
 
-For the INTEL package, the default is Nphi = 1 and the option
-defaults are omp = 0, mode = mixed, lrt = no, balance = -1, tpc = 4,
-tptask = 240.  The default ghost option is determined by the pair
-style being used.  This value is output to the screen in the offload
-report at the end of each run.  Note that all of these settings,
-except "omp" and "mode", are ignored if LAMMPS was not built with Xeon
-Phi co-processor support.  These settings are made automatically if the
-"-sf intel" :doc:`command-line switch <Run_options>` is used.  If it is
-not used, you must invoke the package intel command in your input
-script or via the "-pk intel" :doc:`command-line switch <Run_options>`.
+For the INTEL package, the default is Nphi = 1 and the option defaults are omp
+= 0, mode = mixed, lrt = no, balance = -1, tpc = 4, tptask = 240, pppm_table =
+yes.  The default ghost option is determined by the pair style being used.
+This value is output to the screen in the offload report at the end of each
+run.  Note that all of these settings, except "omp" and "mode", are ignored if
+LAMMPS was not built with Xeon Phi co-processor support.  These settings are
+made automatically if the "-sf intel" :doc:`command-line switch <Run_options>`
+is used.  If it is not used, you must invoke the package intel command in your
+input script or via the "-pk intel" :doc:`command-line switch <Run_options>`.
 
 For the KOKKOS package, the option defaults for GPUs are neigh = full,
-neigh/qeq = full, newton = off, binsize for GPUs = 2x LAMMPS default
-value, comm = device, sort = device, neigh/transpose = off, gpu/aware =
-on. When LAMMPS can safely detect that GPU-aware MPI is not available,
-the default value of gpu/aware becomes "off". For CPUs or Xeon Phis, the
-option defaults are neigh = half, neigh/qeq = half, newton = on, binsize
-= 0.0, comm = no, and sort = no.  The option neigh/thread = on when
-there are 16K atoms or less on an MPI rank, otherwise it is "off". These
-settings are made automatically by the required "-k on"
-:doc:`command-line switch <Run_options>`. You can change them by using
-the package kokkos command in your input script or via the :doc:`-pk
+neigh/qeq = full, newton = off, binsize for GPUs = 2x LAMMPS default value,
+comm = device, sort = device, neigh/transpose = off, gpu/aware = on. When
+LAMMPS can safely detect that GPU-aware MPI is not available, the default value
+of gpu/aware becomes "off". For CPUs or Xeon Phis, the option defaults are
+neigh = half, neigh/qeq = half, newton = on, binsize = 0.0, comm = no, and sort
+= no.  The option neigh/thread = on when there are 16K atoms or less on an MPI
+rank, otherwise it is "off". These settings are made automatically by the
+required "-k on" :doc:`command-line switch <Run_options>`. You can change them
+by using the package kokkos command in your input script or via the :doc:`-pk
 kokkos command-line switch <Run_options>`.
 
-For the OMP package, the default is Nthreads = 0 and the option
-defaults are neigh = yes.  These settings are made automatically if
-the "-sf omp" :doc:`command-line switch <Run_options>` is used.  If it
-is not used, you must invoke the package omp command in your input
-script or via the "-pk omp" :doc:`command-line switch <Run_options>`.
+For the OMP package, the default is Nthreads = 0 and the option defaults are
+neigh = yes.  These settings are made automatically if the "-sf omp"
+:doc:`command-line switch <Run_options>` is used.  If it is not used, you must
+invoke the package omp command in your input script or via the "-pk omp"
+:doc:`command-line switch <Run_options>`.
diff --git a/examples/COUPLE/plugin/liblammpsplugin.h b/examples/COUPLE/plugin/liblammpsplugin.h
index 40fba6b9e3..1520ef638d 100644
--- a/examples/COUPLE/plugin/liblammpsplugin.h
+++ b/examples/COUPLE/plugin/liblammpsplugin.h
@@ -37,12 +37,13 @@
 #endif
 
 /* The following enums must be kept in sync with the equivalent enums
- * or constants in python/lammps/constants.py, fortran/lammps.f90,
- * tools/swig/lammps.i, and examples/COUPLE/plugin/liblammpsplugin.h */
+ * or constants in src/library.h, src/lmptype.h, python/lammps/constants.py,
+ * fortran/lammps.f90, and tools/swig/lammps.i */
 
 /* Data type constants for extracting data from atoms, computes and fixes */
 
 enum _LMP_DATATYPE_CONST {
+  LAMMPS_NONE = -1,     /*!< no data type assigned (yet) */
   LAMMPS_INT = 0,       /*!< 32-bit integer (array) */
   LAMMPS_INT_2D = 1,    /*!< two-dimensional 32-bit integer array */
   LAMMPS_DOUBLE = 2,    /*!< 64-bit double (array) */
diff --git a/fortran/lammps.f90 b/fortran/lammps.f90
index f511e6bb60..ba3997ac8e 100644
--- a/fortran/lammps.f90
+++ b/fortran/lammps.f90
@@ -44,11 +44,12 @@ MODULE LIBLAMMPS
   ! Data type constants for extracting data from global, atom, compute, and fix
   !
   ! Must be kept in sync with the equivalent declarations in
-  ! src/library.h, python/lammps/constants.py, tools/swig/lammps.i,
+  ! src/library.h, src/lmptype.h, python/lammps/constants.py, tools/swig/lammps.i,
   ! and examples/COUPLE/plugin/liblammpsplugin.h
   !
   ! These are NOT part of the API (the part the user sees)
   INTEGER(c_int), PARAMETER :: &
+    LAMMPS_NONE = -1, &       ! no data type assigned (yet)
     LAMMPS_INT = 0, &         ! 32-bit integer (or array)
     LAMMPS_INT_2D = 1, &      ! two-dimensional 32-bit integer array
     LAMMPS_DOUBLE = 2, &      ! 64-bit double (or array)
@@ -1126,11 +1127,13 @@ CONTAINS
   FUNCTION lmp_last_thermo(self,what,index) RESULT(thermo_data)
     CLASS(lammps), INTENT(IN), TARGET :: self
     CHARACTER(LEN=*), INTENT(IN) :: what
-    INTEGER(c_int) :: index
+    INTEGER :: index
+    INTEGER(c_int) :: idx
     TYPE(lammps_data) :: thermo_data, type_data
     INTEGER(c_int) :: datatype
     TYPE(c_ptr) :: Cname, Cptr
 
+    idx = index - 1
     ! set data type for known cases
     SELECT CASE (what)
     CASE ('step')
@@ -1147,7 +1150,7 @@ CONTAINS
         datatype = LAMMPS_STRING
     CASE ('data')
         Cname = f2c_string('type')
-        Cptr = lammps_last_thermo(self%handle,Cname,index-1)
+        Cptr = lammps_last_thermo(self%handle,Cname,idx)
         type_data%lammps_instance => self
         type_data%datatype = DATA_INT
         CALL C_F_POINTER(Cptr, type_data%i32)
@@ -1158,7 +1161,7 @@ CONTAINS
     END SELECT
 
     Cname = f2c_string(what)
-    Cptr = lammps_last_thermo(self%handle,Cname,index-1)
+    Cptr = lammps_last_thermo(self%handle,Cname,idx)
     CALL lammps_free(Cname)
 
     thermo_data%lammps_instance => self
diff --git a/python/lammps/constants.py b/python/lammps/constants.py
index 8fd6a9eaf5..1d0d8adb78 100644
--- a/python/lammps/constants.py
+++ b/python/lammps/constants.py
@@ -13,7 +13,13 @@
 
 # various symbolic constants to be used
 # in certain calls to select data formats
+
+# these must be kept in sync with the enums in src/library.h, src/lmptype.h,
+# tools/swig/lammps.i, examples/COUPLE/plugin/liblammpsplugin.h,
+# and the constants in fortran/lammps.f90
+
 LAMMPS_AUTODETECT  = None
+LAMMPS_NONE        = -1
 LAMMPS_INT         = 0
 LAMMPS_INT_2D      = 1
 LAMMPS_DOUBLE      = 2
@@ -22,8 +28,6 @@ LAMMPS_INT64       = 4
 LAMMPS_INT64_2D    = 5
 LAMMPS_STRING      = 6
 
-# these must be kept in sync with the enums in src/library.h, tools/swig/lammps.i,
-# examples/COUPLE/plugin/liblammpsplugin.h, and the constants in fortran/lammps.f90
 LMP_STYLE_GLOBAL   = 0
 LMP_STYLE_ATOM     = 1
 LMP_STYLE_LOCAL    = 2
diff --git a/python/lammps/core.py b/python/lammps/core.py
index d34d5de4d4..84a80e77a3 100644
--- a/python/lammps/core.py
+++ b/python/lammps/core.py
@@ -746,6 +746,16 @@ class lammps(object):
       return self.lib.lammps_get_thermo(self.lmp,name)
 
   # -------------------------------------------------------------------------
+  @property
+  def last_thermo_step(self):
+    """ Get the last timestep where thermodynamic data was computed
+
+    :return: the timestep or a negative number if there has not been any thermo output yet
+    :rtype: int
+    """
+    with ExceptionCheck(self):
+      ptr = self.lib.lammps_last_thermo(self.lmp, c_char_p("step".encode()), 0)
+    return cast(ptr, POINTER(self.c_bigint)).contents.value
 
   def last_thermo(self):
     """Get a dictionary of the last thermodynamic output
@@ -755,14 +765,12 @@ class lammps(object):
     data from the last timestep into a dictionary.  The return value
     is None, if there has not been any thermo output yet.
 
-    :return: value of thermo keyword
+    :return: a dictionary containing the last computed thermo output values
     :rtype: dict or None
     """
 
     rv = dict()
-    with ExceptionCheck(self):
-      ptr = self.lib.lammps_last_thermo(self.lmp, c_char_p("step".encode()), 0)
-    mystep = cast(ptr, POINTER(self.c_bigint)).contents.value
+    mystep = self.last_thermo_step
     if mystep < 0:
       return None
 
diff --git a/python/lammps/pylammps.py b/python/lammps/pylammps.py
index d0ff7ab1aa..bdbed3e971 100644
--- a/python/lammps/pylammps.py
+++ b/python/lammps/pylammps.py
@@ -13,7 +13,7 @@
 
 ################################################################################
 # Alternative Python Wrapper
-# Written by Richard Berger <richard.berger@temple.edu>
+# Written by Richard Berger <richard.berger@outlook.com>
 ################################################################################
 
 # for python2/3 compatibility
@@ -28,6 +28,7 @@ import tempfile
 from collections import namedtuple
 
 from .core import lammps
+from .constants import *                # lgtm [py/polluting-import]
 
 # -------------------------------------------------------------------------
 
@@ -65,22 +66,43 @@ class OutputCapture(object):
 # -------------------------------------------------------------------------
 
 class Variable(object):
-  def __init__(self, pylammps_instance, name, style, definition):
+  def __init__(self, pylammps_instance, name):
     self._pylmp = pylammps_instance
     self.name = name
-    self.style = style
-    self.definition = definition.split()
+
+  @property
+  def style(self):
+    vartype = self._pylmp.lmp.lib.lammps_extract_variable_datatype(self._pylmp.lmp.lmp, self.name.encode())
+    if vartype == LMP_VAR_EQUAL:
+      return "equal"
+    elif vartype == LMP_VAR_ATOM:
+      return "atom"
+    elif vartype == LMP_VAR_VECTOR:
+      return "vector"
+    elif vartype == LMP_VAR_STRING:
+      return "string"
+    return None
 
   @property
   def value(self):
-    if self.style == 'atom':
-      return list(self._pylmp.lmp.extract_variable(self.name, "all", 1))
+    return self._pylmp.lmp.extract_variable(self.name)
+
+  @value.setter
+  def value(self, newvalue):
+    style = self.style
+    if style == "equal" or style == "string":
+      self._pylmp.variable("{} {} {}".format(self.name, style, newvalue))
     else:
-      value = self._pylmp.lmp_print('"${%s}"' % self.name).strip()
-      try:
-        return float(value)
-      except ValueError:
-        return value
+      raise Exception("Setter not implemented for {} style variables.".format(style))
+
+  def __str__(self):
+    value = self.value
+    if isinstance(value, str):
+      value = "\"{}\"".format(value)
+    return "Variable(name=\"{}\", value={})".format(self.name, value)
+
+  def __repr__(self):
+    return self.__str__()
 
 # -------------------------------------------------------------------------
 
@@ -377,55 +399,6 @@ class variable_set:
     def __repr__(self):
         return self.__str__()
 
-# -------------------------------------------------------------------------
-
-def get_thermo_data(output):
-    """ traverse output of runs and extract thermo data columns """
-    if isinstance(output, str):
-        lines = output.splitlines()
-    else:
-        lines = output
-
-    runs = []
-    columns = []
-    in_run = False
-    current_run = {}
-
-    for line in lines:
-        if line.startswith("Per MPI rank memory allocation"):
-            in_run = True
-        elif in_run and len(columns) == 0:
-            # first line after memory usage are column names
-            columns = line.split()
-
-            current_run = {}
-
-            for col in columns:
-                current_run[col] = []
-
-        elif line.startswith("Loop time of "):
-            in_run = False
-            columns = []
-            thermo_data = variable_set('ThermoData', current_run)
-            r = {'thermo' : thermo_data }
-            runs.append(namedtuple('Run', list(r.keys()))(*list(r.values())))
-        elif in_run and len(columns) > 0:
-            items = line.split()
-            # Convert thermo output and store it.
-            # It must have the same number of columns and
-            # all of them must be convertible to floats.
-            # Otherwise we ignore the line
-            if len(items) == len(columns):
-                try:
-                    values = [float(x) for x in items]
-                    for i, col in enumerate(columns):
-                        current_run[col].append(values[i])
-                except ValueError:
-                  # cannot convert. must be a non-thermo output. ignore.
-                  pass
-
-    return runs
-
 # -------------------------------------------------------------------------
 # -------------------------------------------------------------------------
 
@@ -483,6 +456,9 @@ class PyLammps(object):
     self._enable_cmd_history = False
     self.runs = []
 
+    if not self.lmp.has_package("PYTHON"):
+      print("WARNING: run thermo data not captured since PYTHON LAMMPS package is not enabled")
+
   def __enter__(self):
     return self
 
@@ -573,16 +549,49 @@ class PyLammps(object):
     if self.enable_cmd_history:
       self._cmd_history.append(cmd)
 
+  def _append_run_thermo(self, thermo):
+    for k, v in thermo.items():
+      if k in self._current_run:
+        self._current_run[k].append(v)
+      else:
+        self._current_run[k] = [v]
+
   def run(self, *args, **kwargs):
     """
     Execute LAMMPS run command with given arguments
 
-    All thermo output during the run is captured and saved as new entry in
+    Thermo data of the run is recorded and saved as new entry in
     :py:attr:`PyLammps.runs`. The latest run can be retrieved by
     :py:attr:`PyLammps.last_run`.
+
+    Note, for recording of all thermo steps during a run, the PYTHON package
+    needs to be enabled in LAMMPS. Otherwise, it will only capture the final
+    timestep.
     """
+    self._current_run = {}
+    self._last_thermo_step = -1
+    def end_of_step_callback(lmp):
+      if self.lmp.last_thermo_step == self._last_thermo_step: return
+      thermo = self.lmp.last_thermo()
+      self._append_run_thermo(thermo)
+      self._last_thermo_step = thermo['Step']
+
+    import __main__
+    __main__._PyLammps_end_of_step_callback = end_of_step_callback
+    capture_thermo = self.lmp.has_package("PYTHON")
+
+    if capture_thermo:
+        self.fix("__pylammps_internal_run_callback", "all", "python/invoke", "1", "end_of_step", "_PyLammps_end_of_step_callback")
+
     output = self.__getattr__('run')(*args, **kwargs)
-    self.runs += get_thermo_data(output)
+
+    if capture_thermo:
+        self.unfix("__pylammps_internal_run_callback")
+    self._append_run_thermo(self.lmp.last_thermo())
+
+    thermo_data = variable_set('ThermoData', self._current_run)
+    r = {'thermo' : thermo_data }
+    self.runs.append(namedtuple('Run', list(r.keys()))(*list(r.values())))
     return output
 
   @property
@@ -677,9 +686,7 @@ class PyLammps(object):
     :getter: Returns a list of atom groups that are currently active in this LAMMPS instance
     :type: list
     """
-    output = self.lmp_info("groups")
-    output = output[output.index("Group information:")+1:]
-    return self._parse_groups(output)
+    return self.lmp.available_ids("group")
 
   @property
   def variables(self):
@@ -689,11 +696,9 @@ class PyLammps(object):
     :getter: Returns a dictionary of all variables that are defined in this LAMMPS instance
     :type: dict
     """
-    output = self.lmp_info("variables")
-    output = output[output.index("Variable information:")+1:]
     variables = {}
-    for v in self._parse_element_list(output):
-      variables[v['name']] = Variable(self, v['name'], v['style'], v['def'])
+    for name in self.lmp.available_ids("variable"):
+      variables[name] = Variable(self, name)
     return variables
 
   def eval(self, expr):
@@ -720,66 +725,55 @@ class PyLammps(object):
 
   def _parse_info_system(self, output):
     system = {}
+    system['dimensions'] = self.lmp.extract_setting("dimension")
+    system['xlo'] = self.lmp.extract_global("boxxlo")
+    system['ylo'] = self.lmp.extract_global("boxylo")
+    system['zlo'] = self.lmp.extract_global("boxzlo")
+    system['xhi'] = self.lmp.extract_global("boxxhi")
+    system['yhi'] = self.lmp.extract_global("boxyhi")
+    system['zhi'] = self.lmp.extract_global("boxzhi")
+    xprd = system["xhi"] - system["xlo"]
+    yprd = system["yhi"] - system["ylo"]
+    zprd = system["zhi"] - system["zlo"]
+    if self.lmp.extract_setting("triclinic") == 1:
+      system['triclinic_box'] = (xprd, yprd, zprd)
+    else:
+      system['orthogonal_box'] = (xprd, yprd, zprd)
+    system['nangles'] = self.lmp.extract_global("nbonds")
+    system['nangletypes'] = self.lmp.extract_setting("nbondtypes")
+    system['angle_style'] = self.lmp.extract_global("angle_style")
+    system['nbonds'] = self.lmp.extract_global("nbonds")
+    system['nbondtypes'] = self.lmp.extract_setting("nbondtypes")
+    system['bond_style'] = self.lmp.extract_global("bond_style")
+    system['ndihedrals'] = self.lmp.extract_global("ndihedrals")
+    system['ndihedraltypes'] = self.lmp.extract_setting("ndihedraltypes")
+    system['dihedral_style'] = self.lmp.extract_global("dihedral_style")
+    system['nimpropers'] = self.lmp.extract_global("nimpropers")
+    system['nimpropertypes'] = self.lmp.extract_setting("nimpropertypes")
+    system['improper_style'] = self.lmp.extract_global("improper_style")
+    system['kspace_style'] = self.lmp.extract_global("kspace_style")
+    system['natoms'] = self.lmp.extract_global("natoms")
+    system['ntypes'] = self.lmp.extract_global("ntypes")
+    system['pair_style'] = self.lmp.extract_global("pair_style")
+    system['atom_style'] = self.lmp.extract_global("atom_style")
+    system['units'] = self.lmp.extract_global("units")
 
     for line in output:
-      if line.startswith("Units"):
-        system['units'] = self._get_pair(line)[1]
-      elif line.startswith("Atom style"):
-        system['atom_style'] = self._get_pair(line)[1]
-      elif line.startswith("Atom map"):
+      if line.startswith("Atom map"):
         system['atom_map'] = self._get_pair(line)[1]
       elif line.startswith("Atoms"):
         parts = self._split_values(line)
-        system['natoms'] = int(self._get_pair(parts[0])[1])
-        system['ntypes'] = int(self._get_pair(parts[1])[1])
-        system['style'] = self._get_pair(parts[2])[1]
-      elif line.startswith("Kspace style"):
-        system['kspace_style'] = self._get_pair(line)[1]
-      elif line.startswith("Dimensions"):
-        system['dimensions'] = int(self._get_pair(line)[1])
-      elif line.startswith("Orthogonal box"):
-        system['orthogonal_box'] = [float(x) for x in self._get_pair(line)[1].split('x')]
       elif line.startswith("Boundaries"):
         system['boundaries'] = self._get_pair(line)[1]
-      elif line.startswith("xlo"):
-        keys, values = [self._split_values(x) for x in self._get_pair(line)]
-        for key, value in zip(keys, values):
-          system[key] = float(value)
-      elif line.startswith("ylo"):
-        keys, values = [self._split_values(x) for x in self._get_pair(line)]
-        for key, value in zip(keys, values):
-          system[key] = float(value)
-      elif line.startswith("zlo"):
-        keys, values = [self._split_values(x) for x in self._get_pair(line)]
-        for key, value in zip(keys, values):
-          system[key] = float(value)
       elif line.startswith("Molecule type"):
         system['molecule_type'] = self._get_pair(line)[1]
-      elif line.startswith("Bonds"):
-        parts = self._split_values(line)
-        system['nbonds'] = int(self._get_pair(parts[0])[1])
-        system['nbondtypes'] = int(self._get_pair(parts[1])[1])
-        system['bond_style'] = self._get_pair(parts[2])[1]
-      elif line.startswith("Angles"):
-        parts = self._split_values(line)
-        system['nangles'] = int(self._get_pair(parts[0])[1])
-        system['nangletypes'] = int(self._get_pair(parts[1])[1])
-        system['angle_style'] = self._get_pair(parts[2])[1]
-      elif line.startswith("Dihedrals"):
-        parts = self._split_values(line)
-        system['ndihedrals'] = int(self._get_pair(parts[0])[1])
-        system['ndihedraltypes'] = int(self._get_pair(parts[1])[1])
-        system['dihedral_style'] = self._get_pair(parts[2])[1]
-      elif line.startswith("Impropers"):
-        parts = self._split_values(line)
-        system['nimpropers'] = int(self._get_pair(parts[0])[1])
-        system['nimpropertypes'] = int(self._get_pair(parts[1])[1])
-        system['improper_style'] = self._get_pair(parts[2])[1]
 
     return system
 
   def _parse_info_communication(self, output):
     comm = {}
+    comm['nprocs'] = self.lmp.extract_setting("world_size")
+    comm['nthreads'] = self.lmp.extract_setting("nthreads")
 
     for line in output:
       if line.startswith("MPI library"):
@@ -792,10 +786,6 @@ class PyLammps(object):
         comm['proc_grid'] = [int(x) for x in self._get_pair(line)[1].split('x')]
       elif line.startswith("Communicate velocities for ghost atoms"):
         comm['ghost_velocity'] = (self._get_pair(line)[1] == "yes")
-      elif line.startswith("Nprocs"):
-        parts = self._split_values(line)
-        comm['nprocs'] = int(self._get_pair(parts[0])[1])
-        comm['nthreads'] = int(self._get_pair(parts[1])[1])
     return comm
 
   def _parse_element_list(self, output):
@@ -810,16 +800,6 @@ class PyLammps(object):
       elements.append(element)
     return elements
 
-  def _parse_groups(self, output):
-    groups = []
-    group_pattern = re.compile(r"(?P<name>.+) \((?P<type>.+)\)")
-
-    for line in output:
-      m = group_pattern.match(line.split(':')[1].strip())
-      group = {'name': m.group('name'), 'type': m.group('type')}
-      groups.append(group)
-    return groups
-
   def lmp_print(self, s):
     """ needed for Python2 compatibility, since print is a reserved keyword """
     return self.__getattr__("print")(s)
diff --git a/src/BODY/fix_wall_body_polygon.cpp b/src/BODY/fix_wall_body_polygon.cpp
index 4d0517e527..6f0622cbf6 100644
--- a/src/BODY/fix_wall_body_polygon.cpp
+++ b/src/BODY/fix_wall_body_polygon.cpp
@@ -44,7 +44,7 @@ enum {FAR=0,XLO,XHI,YLO,YHI};
 
 //#define _POLYGON_DEBUG
 #define DELTA 10000
-#define EPSILON 1e-2
+#define EPSILON 1e-2    // dimensionless threshold (dot products, end point checks, contact checks)
 #define BIG 1.0e20
 #define MAX_CONTACTS 4  // maximum number of contacts for 2D models
 #define EFF_CONTACTS 2  // effective contacts for 2D models
diff --git a/src/BODY/fix_wall_body_polyhedron.cpp b/src/BODY/fix_wall_body_polyhedron.cpp
index 212830eb81..546ef1f0d4 100644
--- a/src/BODY/fix_wall_body_polyhedron.cpp
+++ b/src/BODY/fix_wall_body_polyhedron.cpp
@@ -42,7 +42,7 @@ enum {FAR=0,XLO,XHI,YLO,YHI,ZLO,ZHI};
 
 //#define _POLYHEDRON_DEBUG
 #define DELTA 10000
-#define EPSILON 1e-2
+#define EPSILON 1e-3    // dimensionless threshold (dot products, end point checks)
 #define BIG 1.0e20
 #define MAX_CONTACTS 4  // maximum number of contacts for 2D models
 #define EFF_CONTACTS 2  // effective contacts for 2D models
diff --git a/src/BODY/pair_body_rounded_polygon.cpp b/src/BODY/pair_body_rounded_polygon.cpp
index 0e83b63f99..24f38a6a0a 100644
--- a/src/BODY/pair_body_rounded_polygon.cpp
+++ b/src/BODY/pair_body_rounded_polygon.cpp
@@ -40,7 +40,7 @@
 using namespace LAMMPS_NS;
 
 #define DELTA 10000
-#define EPSILON 1e-3
+#define EPSILON 1e-3    // dimensionless threshold (dot products, end point checks, contact checks)
 #define MAX_CONTACTS 4  // maximum number of contacts for 2D models
 #define EFF_CONTACTS 2  // effective contacts for 2D models
 
@@ -624,7 +624,8 @@ void PairBodyRoundedPolygon::sphere_against_sphere(int i, int j,
   fy = dely*fpair/rij;
   fz = delz*fpair/rij;
 
-  if (R <= EPSILON) { // in contact
+  double rmin = MIN(rradi, rradj);
+  if (R <= EPSILON*rmin) { // in contact
 
     // relative translational velocity
 
@@ -1019,6 +1020,7 @@ int PairBodyRoundedPolygon::compute_distance_to_vertex(int ibody,
   double xi1[3],xi2[3],u[3],v[3],uij[3];
   double udotv, magv, magucostheta;
   double delx,dely,delz;
+  double rmin = MIN(rounded_radius, x0_rounded_radius);
 
   ifirst = dfirst[ibody];
   iefirst = edfirst[ibody];
@@ -1105,17 +1107,17 @@ int PairBodyRoundedPolygon::compute_distance_to_vertex(int ibody,
       // x0 and xmi are on the different sides
       // t is the ratio to detect if x0 is closer to the vertices xi or xj
 
-      if (fabs(xi2[0] - xi1[0]) > EPSILON)
+      if (fabs(xi2[0] - xi1[0]) > EPSILON*rmin)
         t = (hi[0] - xi1[0]) / (xi2[0] - xi1[0]);
-      else if (fabs(xi2[1] - xi1[1]) > EPSILON)
+      else if (fabs(xi2[1] - xi1[1]) > EPSILON*rmin)
         t = (hi[1] - xi1[1]) / (xi2[1] - xi1[1]);
-      else if (fabs(xi2[2] - xi1[2]) > EPSILON)
+      else if (fabs(xi2[2] - xi1[2]) > EPSILON*rmin)
         t = (hi[2] - xi1[2]) / (xi2[2] - xi1[2]);
 
       double contact_dist = rounded_radius + x0_rounded_radius;
       if (t >= 0 && t <= 1) {
         mode = EDGE;
-        if (d < contact_dist + EPSILON)
+        if (d < contact_dist + EPSILON*rmin)
           contact = 1;
 
       } else { // t < 0 || t > 1: closer to either vertices of the edge
@@ -1293,8 +1295,14 @@ double PairBodyRoundedPolygon::contact_separation(const Contact& c1,
   double x3 = c2.xv[0];
   double y3 = c2.xv[1];
 
+  int ibody = c1.ibody;
+  int jbody = c1.ibody;
+  double rradi = rounded_radius[ibody];
+  double rradj = rounded_radius[jbody];
+  double rmin = MIN(rradi, rradj);
+
   double delta_a = 0.0;
-  if (fabs(x2 - x1) > EPSILON) {
+  if (fabs(x2 - x1) > EPSILON*rmin) {
     double A = (y2 - y1) / (x2 - x1);
     delta_a = fabs(y1 - A * x1 - y3 + A * x3) / sqrt(1 + A * A);
   } else {
diff --git a/src/BODY/pair_body_rounded_polyhedron.cpp b/src/BODY/pair_body_rounded_polyhedron.cpp
index 2c23199e33..d9a79abad0 100644
--- a/src/BODY/pair_body_rounded_polyhedron.cpp
+++ b/src/BODY/pair_body_rounded_polyhedron.cpp
@@ -44,7 +44,7 @@ using namespace LAMMPS_NS;
 using namespace MathConst;
 
 #define DELTA 10000
-#define EPSILON 1e-3
+#define EPSILON 1e-3     // dimensionless threshold (dot products, end point checks, contact checks)
 #define MAX_FACE_SIZE 4  // maximum number of vertices per face (same as BodyRoundedPolyhedron)
 #define MAX_CONTACTS 32  // for 3D models (including duplicated counts)
 
@@ -1186,7 +1186,13 @@ int PairBodyRoundedPolyhedron::interaction_edge_to_edge(int ibody,
 
   // singularity case, ignore interactions
 
-  if (r < EPSILON) return interact;
+  double rmin = MIN(rounded_radius_i, rounded_radius_j);
+  if (r < EPSILON*rmin) {
+    #ifdef _POLYHEDRON_DEBUG
+    printf("ignore interaction: r = %0.16f\n", r);
+    #endif
+    return interact;
+  }
 
   // include the vertices for interactions
 
@@ -1898,10 +1904,12 @@ void PairBodyRoundedPolyhedron::inside_polygon(int ibody, int face_index,
 
 {
   int i,n,ifirst,iffirst,npi1,npi2;
-  double xi1[3],xi2[3],u[3],v[3],costheta,anglesum1,anglesum2,magu,magv;
+  double xi1[3],xi2[3],u[3],v[3],costheta,anglesum1,anglesum2,magu,magv,rradi;
 
   ifirst = dfirst[ibody];
   iffirst = facfirst[ibody];
+  rradi = rounded_radius[ibody];
+  double rradsq = rradi*rradi;
   anglesum1 = anglesum2 = 0;;
   for (i = 0; i < MAX_FACE_SIZE; i++) {
     npi1 = static_cast<int>(face[iffirst+face_index][i]);
@@ -1929,7 +1937,7 @@ void PairBodyRoundedPolyhedron::inside_polygon(int ibody, int face_index,
 
     // the point is at either vertices
 
-    if (magu * magv < EPSILON) inside1 = 1;
+    if (magu * magv < EPSILON*rradsq) inside1 = 1;
     else {
       costheta = MathExtra::dot3(u,v)/(magu*magv);
       anglesum1 += acos(costheta);
@@ -1940,7 +1948,7 @@ void PairBodyRoundedPolyhedron::inside_polygon(int ibody, int face_index,
       MathExtra::sub3(xi2,q2,v);
       magu = MathExtra::len3(u);
       magv = MathExtra::len3(v);
-      if (magu * magv < EPSILON) inside2 = 1;
+      if (magu * magv < EPSILON*rradsq) inside2 = 1;
       else {
         costheta = MathExtra::dot3(u,v)/(magu*magv);
         anglesum2 += acos(costheta);
@@ -2338,7 +2346,12 @@ void PairBodyRoundedPolyhedron::find_unique_contacts(Contact* contact_list,
     for (int j = i + 1; j < n; j++) {
       if (contact_list[i].unique == 0) continue;
       double d = contact_separation(contact_list[i], contact_list[j]);
-      if (d < EPSILON) contact_list[j].unique = 0;
+      int ibody = contact_list[i].ibody;
+      int jbody = contact_list[i].jbody;
+      double rradi = rounded_radius[ibody];
+      double rradj = rounded_radius[jbody];
+      double rmin = MIN(rradi, rradj);
+      if (d < EPSILON*rmin) contact_list[j].unique = 0;
     }
   }
 }
diff --git a/src/EXTRA-DUMP/dump_yaml.cpp b/src/EXTRA-DUMP/dump_yaml.cpp
index 47cab1ee02..3ca5c59edf 100644
--- a/src/EXTRA-DUMP/dump_yaml.cpp
+++ b/src/EXTRA-DUMP/dump_yaml.cpp
@@ -71,11 +71,11 @@ void DumpYAML::write_header(bigint ndump)
       thermo_data += "]\n  - data: [ ";
 
       for (int i = 0; i < nfield; ++i) {
-        if (fields[i].type == multitype::DOUBLE)
+        if (fields[i].type == multitype::LAMMPS_DOUBLE)
           thermo_data += fmt::format("{}, ", fields[i].data.d);
-        else if (fields[i].type == multitype::INT)
+        else if (fields[i].type == multitype::LAMMPS_INT)
           thermo_data += fmt::format("{}, ", fields[i].data.i);
-        else if (fields[i].type == multitype::BIGINT)
+        else if (fields[i].type == multitype::LAMMPS_INT64)
           thermo_data += fmt::format("{}, ", fields[i].data.b);
         else
           thermo_data += ", ";
diff --git a/src/INTEL/TEST/run_benchmarks.sh b/src/INTEL/TEST/run_benchmarks.sh
index 01ddfbebf3..82eb51c928 100755
--- a/src/INTEL/TEST/run_benchmarks.sh
+++ b/src/INTEL/TEST/run_benchmarks.sh
@@ -7,12 +7,10 @@
 # --------------------- MPI Launch Command
 
 export MPI="mpirun"
-#export MPI="numactl -p 1 mpirun"    # -- Systems w/ MCDRAM in flat mode
 
 # ------------- Name and location of the LAMMPS binary
 
 export LMP_BIN=../../lmp_intel_cpu_intelmpi
-#export LMP_BIN=../../lmp_knl
 
 # ------------- Directory containing the LAMMPS installation
 
@@ -20,21 +18,18 @@ export LMP_ROOT=../../../
 
 # ------------- Number of physical cores (not HW threads)
 
-export LMP_CORES=36            # -- For Intel Xeon E5-2697v4 SKU
-#export LMP_CORES=68           # -- For Intel Xeon Phi x200 7250 SKU
+export LMP_CORES=36
+# Set automatically with lscpu
+export LMP_CORES=`lscpu | awk '$1=="Core(s)"{t=NF; cores=$t}$1=="Socket(s):"{t=NF; sockets=$t}END{print cores*sockets}'`
 
 # ------------- Number of HW threads to use in tests
 
-export LMP_THREAD_LIST="2"     # -- For 2 threads per core w/ HT enabled
-#export LMP_THREAD_LIST="2 4"   # -- For 2 threads per core w/ HT enabled
+export LMP_THREAD_LIST="1 2"   # -- Also test 2 threads per core w/ HT enabled
 
 # ------------- MPI Tuning Parameters
 
-#export I_MPI_SHM_LMT=shm      # -- Uncomment for Xeon Phi x200 series
-
-# ------------- Library locations for build
-
-#source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh
+export I_MPI_FABRICS=shm
+export I_MPI_PIN_DOMAIN=core
 
 #########################################################################
 # End settings for your system
@@ -50,8 +45,6 @@ export DATE_STRING=`date +%s`
 export LOG_DIR=$LOG_DIR_HOST"_"$LOG_DIR_HEADER"_"$DATE_STRING
 mkdir $LOG_DIR
 
-export I_MPI_PIN_DOMAIN=core
-#export I_MPI_FABRICS=shm
 export KMP_BLOCKTIME=0
 
 echo -n "Creating restart file...."
@@ -73,6 +66,9 @@ do
       export LOGFILE=$LOG_DIR/$workload"_lrt".$LMP_CORES"c"$threads"t".log
       cmd="$MPI -np $LMP_CORES $LMP_BIN -in in.intel.$workload -log $LOGFILE $RLMP_ARGS";
       rthreads=$((threads-1))
+      if [ $rthreads -lt 1 ]; then
+          rthreads=1
+      fi
       export KMP_AFFINITY=none
       export OMP_NUM_THREADS=$rthreads
       echo "  $cmd" >> $LOG_DIR/commands.info
@@ -82,4 +78,5 @@ do
 done
 
 # Performance reported by LAMMPS (Timesteps/second ignoring warm-up run)
-grep Perf $LOG_DIR/*.log | awk 'BEGIN{n=1}n%2==0{print $0}{n++}' | sed 's/\/day//g' | sed 's/steps\/s/steps_s/g' | sed 's/hours\/ns//g' | sed 's/.*\///g' | sed 's/\.log:Performance://g' | awk '{c=NF-1; print $1,$c}'
+grep Perf $LOG_DIR/*.log | awk 'n%2==1{c=NF-3; print $1,$c}{n++}' | sed -e 's/.*\///g' -e 's/:.*://g'
+
diff --git a/src/INTEL/angle_charmm_intel.cpp b/src/INTEL/angle_charmm_intel.cpp
index 1ea9c8348e..e13448e28a 100644
--- a/src/INTEL/angle_charmm_intel.cpp
+++ b/src/INTEL/angle_charmm_intel.cpp
@@ -178,7 +178,7 @@ void AngleCharmmIntel::eval(const int vflag,
       const flt_t delz1 = x[i1].z - x[i2].z;
 
       const flt_t rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
-      flt_t ir12 = (flt_t)1.0/sqrt(rsq1);
+      flt_t ir12 = (flt_t)1.0/std::sqrt(rsq1);
 
       // 2nd bond
 
@@ -187,7 +187,7 @@ void AngleCharmmIntel::eval(const int vflag,
       const flt_t delz2 = x[i3].z - x[i2].z;
 
       const flt_t rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
-      ir12 *= (flt_t)1.0/sqrt(rsq2);
+      ir12 *= (flt_t)1.0/std::sqrt(rsq2);
 
       // Urey-Bradley bond
 
@@ -196,7 +196,7 @@ void AngleCharmmIntel::eval(const int vflag,
       const flt_t delzUB = x[i3].z - x[i1].z;
 
       const flt_t rsqUB = delxUB*delxUB + delyUB*delyUB + delzUB*delzUB;
-      const flt_t irUB = (flt_t)1.0/sqrt(rsqUB);
+      const flt_t irUB = (flt_t)1.0/std::sqrt(rsqUB);
 
       // Urey-Bradley force & energy
 
@@ -219,12 +219,12 @@ void AngleCharmmIntel::eval(const int vflag,
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       const flt_t sd = (flt_t)1.0 - c * c;
-      flt_t s = (flt_t)1.0 / sqrt(sd);
+      flt_t s = (flt_t)1.0 / std::sqrt(sd);
       if (sd < SMALL2) s = INVSMALL;
 
       // harmonic force & energy
 
-      const flt_t dtheta = acos(c) - fc.fc[type].theta0;
+      const flt_t dtheta = std::acos(c) - fc.fc[type].theta0;
       const flt_t tk = fc.fc[type].k * dtheta;
 
       if (EFLAG) eangle += tk*dtheta;
diff --git a/src/INTEL/angle_harmonic_intel.cpp b/src/INTEL/angle_harmonic_intel.cpp
index 8cdab1d43a..98125b164b 100644
--- a/src/INTEL/angle_harmonic_intel.cpp
+++ b/src/INTEL/angle_harmonic_intel.cpp
@@ -178,7 +178,7 @@ void AngleHarmonicIntel::eval(const int vflag,
       const flt_t delz1 = x[i1].z - x[i2].z;
 
       const flt_t rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
-      const flt_t r1 = (flt_t)1.0/sqrt(rsq1);
+      const flt_t r1 = (flt_t)1.0/std::sqrt(rsq1);
 
       // 2nd bond
 
@@ -187,7 +187,7 @@ void AngleHarmonicIntel::eval(const int vflag,
       const flt_t delz2 = x[i3].z - x[i2].z;
 
       const flt_t rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
-      const flt_t r2 = (flt_t)1.0/sqrt(rsq2);
+      const flt_t r2 = (flt_t)1.0/std::sqrt(rsq2);
 
       // angle (cos and sin)
 
@@ -199,12 +199,12 @@ void AngleHarmonicIntel::eval(const int vflag,
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       const flt_t sd = (flt_t)1.0 - c * c;
-      flt_t s = (flt_t)1.0/sqrt(sd);
+      flt_t s = (flt_t)1.0/std::sqrt(sd);
       if (sd < SMALL2) s = INVSMALL;
 
       // harmonic force & energy
 
-      const flt_t dtheta = acos(c) - fc.fc[type].theta0;
+      const flt_t dtheta = std::acos(c) - fc.fc[type].theta0;
       const flt_t tk = fc.fc[type].k * dtheta;
 
       flt_t eangle;
diff --git a/src/INTEL/bond_harmonic_intel.cpp b/src/INTEL/bond_harmonic_intel.cpp
index a60050bb6b..ab38cefea4 100644
--- a/src/INTEL/bond_harmonic_intel.cpp
+++ b/src/INTEL/bond_harmonic_intel.cpp
@@ -168,7 +168,7 @@ void BondHarmonicIntel::eval(const int vflag,
       const flt_t delz = x[i1].z - x[i2].z;
 
       const flt_t rsq = delx*delx + dely*dely + delz*delz;
-      const flt_t r = sqrt(rsq);
+      const flt_t r = std::sqrt(rsq);
       const flt_t dr = r - fc.fc[type].r0;
       const flt_t rk = fc.fc[type].k * dr;
 
diff --git a/src/INTEL/dihedral_charmm_intel.cpp b/src/INTEL/dihedral_charmm_intel.cpp
index 6c3ae2c927..a41cfb867c 100644
--- a/src/INTEL/dihedral_charmm_intel.cpp
+++ b/src/INTEL/dihedral_charmm_intel.cpp
@@ -240,14 +240,14 @@ void DihedralCharmmIntel::eval(const int vflag,
       const flt_t rasq = ax*ax + ay*ay + az*az;
       const flt_t rbsq = bx*bx + by*by + bz*bz;
       const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
-      const flt_t rg = sqrt(rgsq);
+      const flt_t rg = std::sqrt(rgsq);
 
       flt_t rginv, ra2inv, rb2inv;
       rginv = ra2inv = rb2inv = (flt_t)0.0;
       if (rg > 0) rginv = (flt_t)1.0/rg;
       if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
       if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
-      const flt_t rabinv = sqrt(ra2inv*rb2inv);
+      const flt_t rabinv = std::sqrt(ra2inv*rb2inv);
 
       flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
       const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
@@ -367,7 +367,7 @@ void DihedralCharmmIntel::eval(const int vflag,
 
       flt_t forcecoul;
       if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv;
-      else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv);
+      else forcecoul = qqrd2e * q[i1]*q[i4]*std::sqrt(r2inv);
       const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv -
                                      fc.ljp[itype][jtype].lj2);
       const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv;
diff --git a/src/INTEL/dihedral_fourier_intel.cpp b/src/INTEL/dihedral_fourier_intel.cpp
index 5448fdae98..595d747839 100644
--- a/src/INTEL/dihedral_fourier_intel.cpp
+++ b/src/INTEL/dihedral_fourier_intel.cpp
@@ -199,14 +199,14 @@ void DihedralFourierIntel::eval(const int vflag,
       const flt_t rasq = ax*ax + ay*ay + az*az;
       const flt_t rbsq = bx*bx + by*by + bz*bz;
       const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
-      const flt_t rg = sqrt(rgsq);
+      const flt_t rg = std::sqrt(rgsq);
 
       flt_t rginv, ra2inv, rb2inv;
       rginv = ra2inv = rb2inv = (flt_t)0.0;
       if (rg > 0) rginv = (flt_t)1.0/rg;
       if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
       if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
-      const flt_t rabinv = sqrt(ra2inv*rb2inv);
+      const flt_t rabinv = std::sqrt(ra2inv*rb2inv);
 
       flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
       const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
diff --git a/src/INTEL/dihedral_harmonic_intel.cpp b/src/INTEL/dihedral_harmonic_intel.cpp
index 8d91a3fd27..138545d94a 100644
--- a/src/INTEL/dihedral_harmonic_intel.cpp
+++ b/src/INTEL/dihedral_harmonic_intel.cpp
@@ -199,14 +199,14 @@ void DihedralHarmonicIntel::eval(const int vflag,
       const flt_t rasq = ax*ax + ay*ay + az*az;
       const flt_t rbsq = bx*bx + by*by + bz*bz;
       const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
-      const flt_t rg = sqrt(rgsq);
+      const flt_t rg = std::sqrt(rgsq);
 
       flt_t rginv, ra2inv, rb2inv;
       rginv = ra2inv = rb2inv = (flt_t)0.0;
       if (rg > 0) rginv = (flt_t)1.0/rg;
       if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
       if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
-      const flt_t rabinv = sqrt(ra2inv*rb2inv);
+      const flt_t rabinv = std::sqrt(ra2inv*rb2inv);
 
       flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
       const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
diff --git a/src/INTEL/dihedral_opls_intel.cpp b/src/INTEL/dihedral_opls_intel.cpp
index 82151a8585..52763e6bd8 100644
--- a/src/INTEL/dihedral_opls_intel.cpp
+++ b/src/INTEL/dihedral_opls_intel.cpp
@@ -195,15 +195,15 @@ void DihedralOPLSIntel::eval(const int vflag,
       // 1st and 2nd angle
 
       const flt_t b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
-      const flt_t rb1 = (flt_t)1.0 / sqrt(b1mag2);
+      const flt_t rb1 = (flt_t)1.0 / std::sqrt(b1mag2);
       const flt_t sb1 = (flt_t)1.0 / b1mag2;
 
       const flt_t b2mag2 = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
-      const flt_t rb2 = (flt_t)1.0 / sqrt(b2mag2);
+      const flt_t rb2 = (flt_t)1.0 / std::sqrt(b2mag2);
       const flt_t sb2 = (flt_t)1.0 / b2mag2;
 
       const flt_t b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
-      const flt_t rb3 = (flt_t)1.0 / sqrt(b3mag2);
+      const flt_t rb3 = (flt_t)1.0 / std::sqrt(b3mag2);
       const flt_t sb3 = (flt_t)1.0 / b3mag2;
 
       const flt_t c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
@@ -219,11 +219,11 @@ void DihedralOPLSIntel::eval(const int vflag,
       // cos and sin of 2 angles and final c
 
       flt_t sin2 = MAX((flt_t)1.0 - c1mag*c1mag,(flt_t)0.0);
-      flt_t sc1 = (flt_t)1.0/sqrt(sin2);
+      flt_t sc1 = (flt_t)1.0/std::sqrt(sin2);
       if (sin2 < SMALL2) sc1 = INVSMALL;
 
       sin2 = MAX((flt_t)1.0 - c2mag*c2mag,(flt_t)0.0);
-      flt_t sc2 = (flt_t)1.0/sqrt(sin2);
+      flt_t sc2 = (flt_t)1.0/std::sqrt(sin2);
       if (sin2 < SMALL2) sc2 = INVSMALL;
 
       const flt_t s1 = sc1 * sc1;
@@ -234,7 +234,7 @@ void DihedralOPLSIntel::eval(const int vflag,
       const flt_t cx = vb1z*vb2ym - vb1y*vb2zm;
       const flt_t cy = vb1x*vb2zm - vb1z*vb2xm;
       const flt_t cz = vb1y*vb2xm - vb1x*vb2ym;
-      const flt_t cmag = (flt_t)1.0/sqrt(cx*cx + cy*cy + cz*cz);
+      const flt_t cmag = (flt_t)1.0/std::sqrt(cx*cx + cy*cy + cz*cz);
       const flt_t dx = (cx*vb3x + cy*vb3y + cz*vb3z)*cmag*rb3;
 
       // error check
@@ -252,7 +252,7 @@ void DihedralOPLSIntel::eval(const int vflag,
 
       const flt_t cossq = c * c;
       const flt_t sinsq = (flt_t)1.0 - cossq;
-      flt_t siinv = (flt_t)1.0/sqrt(sinsq);
+      flt_t siinv = (flt_t)1.0/std::sqrt(sinsq);
       if (sinsq < SMALLER2 ) siinv = INVSMALLER;
       if (dx < (flt_t)0.0) siinv = -siinv;
 
diff --git a/src/INTEL/fix_intel.cpp b/src/INTEL/fix_intel.cpp
index 1ce0b41338..59224cc511 100644
--- a/src/INTEL/fix_intel.cpp
+++ b/src/INTEL/fix_intel.cpp
@@ -61,6 +61,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
   _hybrid_nonpair = 0;
   _print_pkg_info = 1;
   _nthreads = comm->nthreads;
+  _torque_flag = 0;
 
   _precision_mode = PREC_MODE_MIXED;
   _offload_balance = -1.0;
@@ -95,6 +96,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
   _allow_separate_buffers = 1;
   _offload_ghost = -1;
   _lrt = 0;
+  _p3m_table = 1;
 
   int iarg = 4;
   while (iarg < narg) {
@@ -135,11 +137,14 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
       if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
       _lrt = utils::logical(FLERR,arg[iarg+1],false,lmp);
       iarg += 2;
-    }
+    } else if (strcmp(arg[iarg], "pppm_table") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
+      _p3m_table = utils::logical(FLERR,arg[iarg+1],false,lmp);
+      iarg += 2;
 
     // undocumented options
 
-    else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) {
+    } else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) {
       _offload_affinity_balanced = 1;
       iarg++;
     } else if (strcmp(arg[iarg],"buffers") == 0) {
@@ -275,10 +280,8 @@ int FixIntel::setmask()
   int mask = 0;
   mask |= PRE_REVERSE;
   mask |= MIN_PRE_REVERSE;
-  #ifdef _LMP_INTEL_OFFLOAD
   mask |= POST_FORCE;
   mask |= MIN_POST_FORCE;
-  #endif
   mask |= POST_RUN;
   return mask;
 }
@@ -299,14 +302,29 @@ void FixIntel::init()
   }
   #endif
 
+  _torque_flag = 0;
+  if (force->pair_match("gayberne/intel$", 0)) _torque_flag = 1;
+
   const int nstyles = _pair_intel_count;
   if (force->pair_match("^hybrid", 0) != nullptr) {
     _pair_hybrid_flag = 1;
+
+    // Check if need to handle torque
+    auto hybrid = dynamic_cast<PairHybrid *>(force->pair);
+    for (int i = 0; i < hybrid->nstyles; i++)
+      if (utils::strmatch(hybrid->keywords[i],"/intel$") &&
+          utils::strmatch(hybrid->keywords[i],"gayberne"))
+        _torque_flag = 1;
+
     if (force->newton_pair != 0 && force->pair->no_virial_fdotr_compute)
       error->all(FLERR,"INTEL package requires fdotr virial with newton on.");
   } else
     _pair_hybrid_flag = 0;
 
+  if (_torque_flag && nstyles > 1)
+    error->all(FLERR,"gayberne/intel style cannot yet be used with other "
+               "intel pair styles.");
+
   if (nstyles > 1 && _pair_hybrid_flag) _pair_hybrid_flag = 2;
   else if (force->newton_pair == 0) _pair_hybrid_flag = 0;
 
@@ -333,14 +351,17 @@ void FixIntel::init()
   int off_mode = 0;
   if (_offload_balance != 0.0) off_mode = 1;
   if (_precision_mode == PREC_MODE_SINGLE) {
+    _single_buffers->set_torque_flag(_torque_flag);
     _single_buffers->zero_ev();
     _single_buffers->grow_ncache(off_mode, comm->nthreads);
     _single_buffers->free_list_ptrs();
   } else if (_precision_mode == PREC_MODE_MIXED) {
+    _mixed_buffers->set_torque_flag(_torque_flag);
     _mixed_buffers->zero_ev();
     _mixed_buffers->grow_ncache(off_mode, comm->nthreads);
     _mixed_buffers->free_list_ptrs();
   } else {
+    _double_buffers->set_torque_flag(_torque_flag);
     _double_buffers->zero_ev();
     _double_buffers->grow_ncache(off_mode, comm->nthreads);
     _double_buffers->free_list_ptrs();
@@ -390,7 +411,7 @@ bool FixIntel::pair_hybrid_check()
 void FixIntel::pair_init_check(const bool cdmessage)
 {
   #ifdef INTEL_VMASK
-  atom->sortfreq = 1;
+  if (atom->sortfreq) atom->sortfreq = 1;
   #endif
 
   _nbor_pack_width = 1;
@@ -597,6 +618,19 @@ void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/)
 
 /* ---------------------------------------------------------------------- */
 
+void FixIntel::post_force(int vflag)
+{
+  // Redundant call to sync Intel data structs with native for methods that
+  // call force compute but do not call prereverse
+  _sync_main_arrays(1);
+
+  #ifdef LMP_INTEL_OFFLOAD
+  if (_sync_mode == 2) sync_coprocessor();
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
 template <class acc_t>
 void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
 {
@@ -605,7 +639,7 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
     o_range = atom->nlocal + atom->nghost;
   else
     o_range = atom->nlocal;
-  IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque);
+  IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), _torque_flag);
 
   o_range *= 4;
   const int f_stride4 = f_stride * 4;
@@ -704,7 +738,7 @@ void FixIntel::add_results(const ft * _noalias const f_in,
         add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
         const acc_t * _noalias const enull = 0;
         int offset = _offload_nlocal;
-        if (atom->torque) offset *= 2;
+        if (_torque_flag) offset *= 2;
         add_oresults(f_in + offset, enull, eatom, vatom,
                      _offload_min_ghost, _offload_nghost);
       } else
@@ -715,7 +749,7 @@ void FixIntel::add_results(const ft * _noalias const f_in,
                      _host_min_local, _host_used_local);
         const acc_t * _noalias const enull = 0;
         int offset = _host_used_local;
-        if (atom->torque) offset *= 2;
+        if (_torque_flag) offset *= 2;
         add_oresults(f_in + offset, enull, eatom,
                      vatom, _host_min_ghost, _host_used_ghost);
       } else {
@@ -764,7 +798,7 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
                             const int eatom, const int /*vatom*/,
                             const int out_offset, const int nall) {
   lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
-  if (atom->torque) {
+  if (_torque_flag) {
     if (f_in[1].w)
     {
       if (f_in[1].w == 1)
@@ -788,7 +822,7 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
     #endif
     int ifrom, ito;
     IP_PRE_omp_range_align(ifrom, ito, tid, nall, packthreads, sizeof(acc_t));
-    if (atom->torque) {
+    if (_torque_flag) {
       int ii = ifrom * 2;
       lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
         out_offset;
@@ -883,13 +917,6 @@ double FixIntel::memory_usage()
 
 /* ---------------------------------------------------------------------- */
 
-void FixIntel::post_force(int vflag)
-{
-  if (_sync_mode == 2) sync_coprocessor();
-}
-
-/* ---------------------------------------------------------------------- */
-
 template <class ft, class acc_t>
 void FixIntel::add_off_results(const ft * _noalias const f_in,
                                const acc_t * _noalias const ev_global) {
@@ -919,7 +946,7 @@ void FixIntel::add_off_results(const ft * _noalias const f_in,
       _offload_nlocal;
   }
 
-  if (atom->torque)
+  if (_torque_flag)
     if (f_in[1].w < 0.0)
       error->all(FLERR,"Bad matrix inversion in mldivide3");
   add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
diff --git a/src/INTEL/fix_intel.h b/src/INTEL/fix_intel.h
index 9214fe3419..08d0a534d9 100644
--- a/src/INTEL/fix_intel.h
+++ b/src/INTEL/fix_intel.h
@@ -55,6 +55,7 @@ class FixIntel : public Fix {
   void pre_reverse(int eflag = 0, int vflag = 0) override;
   inline void min_pre_reverse(int eflag = 0, int vflag = 0) override { pre_reverse(eflag, vflag); }
 
+  void post_force(int vflag) override;
   void post_run() override { _print_pkg_info = 1; }
 
   // Get all forces, calculation results from coprocesser
@@ -102,7 +103,7 @@ class FixIntel : public Fix {
   inline int pppm_table()
   {
     if (force->kspace_match("^pppm/.*intel$", 0))
-      return INTEL_P3M_TABLE;
+      return _p3m_table;
     else
       return 0;
   }
@@ -115,7 +116,7 @@ class FixIntel : public Fix {
   int _precision_mode, _nthreads, _nbor_pack_width, _three_body_neighbor;
   int _pair_intel_count, _pair_hybrid_flag, _print_pkg_info;
   // These should be removed in subsequent update w/ simpler hybrid arch
-  int _pair_hybrid_zero, _hybrid_nonpair, _zero_master;
+  int _pair_hybrid_zero, _hybrid_nonpair, _zero_master, _torque_flag;
 
  public:
   inline int *get_overflow_flag() { return _overflow_flag; }
@@ -132,7 +133,6 @@ class FixIntel : public Fix {
   inline void get_buffern(const int offload, int &nlocal, int &nall, int &minlocal);
 
 #ifdef _LMP_INTEL_OFFLOAD
-  void post_force(int vflag);
   inline int coprocessor_number() { return _cop; }
   inline int full_host_list() { return _full_host_list; }
   void set_offload_affinity();
@@ -194,7 +194,7 @@ class FixIntel : public Fix {
  protected:
   int _overflow_flag[5];
   _alignvar(int _off_overflow_flag[5], 64);
-  int _allow_separate_buffers, _offload_ghost, _lrt;
+  int _allow_separate_buffers, _offload_ghost, _lrt, _p3m_table;
 
   IntelBuffers<float, float>::vec3_acc_t *_force_array_s;
   IntelBuffers<float, double>::vec3_acc_t *_force_array_m;
diff --git a/src/INTEL/fix_nh_intel.cpp b/src/INTEL/fix_nh_intel.cpp
index 87ce0dbe43..2c05c3f6fa 100644
--- a/src/INTEL/fix_nh_intel.cpp
+++ b/src/INTEL/fix_nh_intel.cpp
@@ -166,28 +166,28 @@ void FixNHIntel::remap()
   if (pstyle == TRICLINIC) {
 
     if (p_flag[4]) {
-      expfac = exp(dto8*omega_dot[0]);
+      expfac = std::exp(dto8*omega_dot[0]);
       h[4] *= expfac;
       h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]);
       h[4] *= expfac;
     }
 
     if (p_flag[3]) {
-      expfac = exp(dto4*omega_dot[1]);
+      expfac = std::exp(dto4*omega_dot[1]);
       h[3] *= expfac;
       h[3] += dto2*(omega_dot[3]*h[2]);
       h[3] *= expfac;
     }
 
     if (p_flag[5]) {
-      expfac = exp(dto4*omega_dot[0]);
+      expfac = std::exp(dto4*omega_dot[0]);
       h[5] *= expfac;
       h[5] += dto2*(omega_dot[5]*h[1]);
       h[5] *= expfac;
     }
 
     if (p_flag[4]) {
-      expfac = exp(dto8*omega_dot[0]);
+      expfac = std::exp(dto8*omega_dot[0]);
       h[4] *= expfac;
       h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]);
       h[4] *= expfac;
@@ -200,7 +200,7 @@ void FixNHIntel::remap()
   if (p_flag[0]) {
     oldlo = domain->boxlo[0];
     oldhi = domain->boxhi[0];
-    expfac = exp(dto*omega_dot[0]);
+    expfac = std::exp(dto*omega_dot[0]);
     domain->boxlo[0] = (oldlo-fixedpoint[0])*expfac + fixedpoint[0];
     domain->boxhi[0] = (oldhi-fixedpoint[0])*expfac + fixedpoint[0];
   }
@@ -208,7 +208,7 @@ void FixNHIntel::remap()
   if (p_flag[1]) {
     oldlo = domain->boxlo[1];
     oldhi = domain->boxhi[1];
-    expfac = exp(dto*omega_dot[1]);
+    expfac = std::exp(dto*omega_dot[1]);
     domain->boxlo[1] = (oldlo-fixedpoint[1])*expfac + fixedpoint[1];
     domain->boxhi[1] = (oldhi-fixedpoint[1])*expfac + fixedpoint[1];
     if (scalexy) h[5] *= expfac;
@@ -217,7 +217,7 @@ void FixNHIntel::remap()
   if (p_flag[2]) {
     oldlo = domain->boxlo[2];
     oldhi = domain->boxhi[2];
-    expfac = exp(dto*omega_dot[2]);
+    expfac = std::exp(dto*omega_dot[2]);
     domain->boxlo[2] = (oldlo-fixedpoint[2])*expfac + fixedpoint[2];
     domain->boxhi[2] = (oldhi-fixedpoint[2])*expfac + fixedpoint[2];
     if (scalexz) h[4] *= expfac;
@@ -229,28 +229,28 @@ void FixNHIntel::remap()
   if (pstyle == TRICLINIC) {
 
     if (p_flag[4]) {
-      expfac = exp(dto8*omega_dot[0]);
+      expfac = std::exp(dto8*omega_dot[0]);
       h[4] *= expfac;
       h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]);
       h[4] *= expfac;
     }
 
     if (p_flag[3]) {
-      expfac = exp(dto4*omega_dot[1]);
+      expfac = std::exp(dto4*omega_dot[1]);
       h[3] *= expfac;
       h[3] += dto2*(omega_dot[3]*h[2]);
       h[3] *= expfac;
     }
 
     if (p_flag[5]) {
-      expfac = exp(dto4*omega_dot[0]);
+      expfac = std::exp(dto4*omega_dot[0]);
       h[5] *= expfac;
       h[5] += dto2*(omega_dot[5]*h[1]);
       h[5] *= expfac;
     }
 
     if (p_flag[4]) {
-      expfac = exp(dto8*omega_dot[0]);
+      expfac = std::exp(dto8*omega_dot[0]);
       h[4] *= expfac;
       h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]);
       h[4] *= expfac;
@@ -427,9 +427,9 @@ void FixNHIntel::nh_v_press()
   int nlocal = atom->nlocal;
   if (igroup == atom->firstgroup) nlocal = atom->nfirst;
 
-  double f0 = exp(-dt4*(omega_dot[0]+mtk_term2));
-  double f1 = exp(-dt4*(omega_dot[1]+mtk_term2));
-  double f2 = exp(-dt4*(omega_dot[2]+mtk_term2));
+  double f0 = std::exp(-dt4*(omega_dot[0]+mtk_term2));
+  double f1 = std::exp(-dt4*(omega_dot[1]+mtk_term2));
+  double f2 = std::exp(-dt4*(omega_dot[2]+mtk_term2));
   f0 *= f0;
   f1 *= f1;
   f2 *= f2;
diff --git a/src/INTEL/improper_cvff_intel.cpp b/src/INTEL/improper_cvff_intel.cpp
index 09b2d42167..a503f45541 100644
--- a/src/INTEL/improper_cvff_intel.cpp
+++ b/src/INTEL/improper_cvff_intel.cpp
@@ -191,15 +191,15 @@ void ImproperCvffIntel::eval(const int vflag,
       // 1st and 2nd angle
 
       const flt_t b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
-      const flt_t rb1 = (flt_t)1.0 / sqrt(b1mag2);
+      const flt_t rb1 = (flt_t)1.0 / std::sqrt(b1mag2);
       const flt_t sb1 = (flt_t)1.0 / b1mag2;
 
       const flt_t b2mag2 = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
-      const flt_t rb2 = (flt_t)1.0 / sqrt(b2mag2);
+      const flt_t rb2 = (flt_t)1.0 / std::sqrt(b2mag2);
       const flt_t sb2 = (flt_t)1.0 / b2mag2;
 
       const flt_t b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
-      const flt_t rb3 = (flt_t)1.0 / sqrt(b3mag2);
+      const flt_t rb3 = (flt_t)1.0 / std::sqrt(b3mag2);
       const flt_t sb3 = (flt_t)1.0 / b3mag2;
 
       const flt_t c0 = (vb1x * vb3x + vb1y * vb3y + vb1z * vb3z) * rb1 * rb3;
@@ -215,11 +215,11 @@ void ImproperCvffIntel::eval(const int vflag,
       // cos and sin of 2 angles and final c
 
       const flt_t sd1 = (flt_t)1.0 - c1mag * c1mag;
-      flt_t sc1 = (flt_t)1.0/sqrt(sd1);
+      flt_t sc1 = (flt_t)1.0/std::sqrt(sd1);
       if (sd1 < SMALL2) sc1 = INVSMALL;
 
       const flt_t sd2 = (flt_t)1.0 - c2mag * c2mag;
-      flt_t sc2 = (flt_t)1.0/sqrt(sd2);
+      flt_t sc2 = (flt_t)1.0/std::sqrt(sd2);
       if (sc2 < SMALL2) sc2 = INVSMALL;
 
       const flt_t s1 = sc1 * sc1;
diff --git a/src/INTEL/improper_harmonic_intel.cpp b/src/INTEL/improper_harmonic_intel.cpp
index a9a152d5f0..869051ab6b 100644
--- a/src/INTEL/improper_harmonic_intel.cpp
+++ b/src/INTEL/improper_harmonic_intel.cpp
@@ -194,9 +194,9 @@ void ImproperHarmonicIntel::eval(const int vflag,
       flt_t ss2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
       flt_t ss3 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
 
-      const flt_t r1 = (flt_t)1.0 / sqrt(ss1);
-      const flt_t r2 = (flt_t)1.0 / sqrt(ss2);
-      const flt_t r3 = (flt_t)1.0 / sqrt(ss3);
+      const flt_t r1 = (flt_t)1.0 / std::sqrt(ss1);
+      const flt_t r2 = (flt_t)1.0 / std::sqrt(ss2);
+      const flt_t r3 = (flt_t)1.0 / std::sqrt(ss3);
 
       ss1 = (flt_t)1.0 / ss1;
       ss2 = (flt_t)1.0 / ss2;
@@ -214,7 +214,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
       flt_t s2 = (flt_t)1.0 - c2*c2;
       if (s2 < SMALL) s2 = SMALL;
 
-      flt_t s12 = (flt_t)1.0 / sqrt(s1*s2);
+      flt_t s12 = (flt_t)1.0 / std::sqrt(s1*s2);
       s1 = (flt_t)1.0 / s1;
       s2 = (flt_t)1.0 / s2;
       flt_t c = (c1*c2 + c0) * s12;
@@ -229,12 +229,12 @@ void ImproperHarmonicIntel::eval(const int vflag,
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       const flt_t sd = (flt_t)1.0 - c * c;
-      flt_t s = (flt_t)1.0 / sqrt(sd);
+      flt_t s = (flt_t)1.0 / std::sqrt(sd);
       if (sd < SMALL2) s = INVSMALL;
 
       // force & energy
 
-      const flt_t domega = acos(c) - fc.fc[type].chi;
+      const flt_t domega = std::acos(c) - fc.fc[type].chi;
       flt_t a;
       a = fc.fc[type].k * domega;
 
diff --git a/src/INTEL/intel_buffers.cpp b/src/INTEL/intel_buffers.cpp
index bb54577097..6d4c9bffcd 100644
--- a/src/INTEL/intel_buffers.cpp
+++ b/src/INTEL/intel_buffers.cpp
@@ -28,6 +28,7 @@ template <class flt_t, class acc_t>
 IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
     lmp(lmp_in), _x(nullptr), _q(nullptr), _quat(nullptr), _f(nullptr), _off_threads(0),
     _n_list_ptrs(1), _max_list_ptrs(4), _buf_size(0), _buf_local_size(0) {
+  _torque_flag = 0;
   _neigh_list_ptrs = new IntelNeighListPtrs[_max_list_ptrs];
   _neigh_list_ptrs[0].cnumneigh = nullptr;
   _list_alloc_atoms = 0;
@@ -695,7 +696,7 @@ double IntelBuffers<flt_t, acc_t>::memory_usage(const int nthreads)
 {
   double tmem = sizeof(atom_t);
   if (lmp->atom->q) tmem += sizeof(flt_t);
-  if (lmp->atom->torque) tmem += sizeof(quat_t);
+  if (_torque_flag) tmem += sizeof(quat_t);
   #ifdef _LMP_INTEL_OFFLOAD
   if (_separate_buffers) tmem *= 2;
   #endif
diff --git a/src/INTEL/intel_buffers.h b/src/INTEL/intel_buffers.h
index 7422850184..1e9739c3b2 100644
--- a/src/INTEL/intel_buffers.h
+++ b/src/INTEL/intel_buffers.h
@@ -57,7 +57,7 @@ class IntelBuffers {
   inline int get_stride(int nall) {
     int stride;
     IP_PRE_get_stride(stride, nall, sizeof(vec3_acc_t),
-                         lmp->atom->torque);
+                      _torque_flag);
     return stride;
   }
 
@@ -76,6 +76,8 @@ class IntelBuffers {
     _neigh_list_ptrs[0].numneighhalf = atombin;
   }
 
+  inline void set_torque_flag(const int in) { _torque_flag = in; }
+
   inline void grow(const int nall, const int nlocal, const int nthreads,
                    const int offload_end) {
     if (nall >= _buf_size || nlocal >= _buf_local_size)
@@ -329,7 +331,7 @@ class IntelBuffers {
   flt_t *_q;
   quat_t *_quat;
   vec3_acc_t * _f;
-  int _off_threads, _off_map_listlocal;
+  int _torque_flag, _off_threads, _off_map_listlocal;
 
   int _list_alloc_atoms;
   int *_list_alloc, *_cnumneigh, *_atombin, *_binpacked;
diff --git a/src/INTEL/intel_intrinsics.h b/src/INTEL/intel_intrinsics.h
index 1ff405edd2..c106dfd411 100644
--- a/src/INTEL/intel_intrinsics.h
+++ b/src/INTEL/intel_intrinsics.h
@@ -1764,7 +1764,7 @@ struct vector_ops<flt_t, NONE> {
       return a < b;
     }
     static fvec invsqrt(const fvec &a) {
-      return 1. / sqrt(a);
+      return 1. / std::sqrt(a);
     }
     static fvec sincos(fvec *c, const fvec &a) {
       *c = cos(a);
diff --git a/src/INTEL/intel_preprocess.h b/src/INTEL/intel_preprocess.h
index a3c961f436..2c4b9a0c1b 100644
--- a/src/INTEL/intel_preprocess.h
+++ b/src/INTEL/intel_preprocess.h
@@ -86,8 +86,6 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #define INTEL_MAX_STENCIL_CHECK 4096
 #define INTEL_P3M_MAXORDER 8
 #define INTEL_P3M_ALIGNED_MAXORDER 8
-// PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
-#define INTEL_P3M_TABLE 1
 
 #ifdef __INTEL_COMPILER
 #ifdef __AVX__
diff --git a/src/INTEL/math_extra_intel.h b/src/INTEL/math_extra_intel.h
index a8869f7384..556f3f4590 100644
--- a/src/INTEL/math_extra_intel.h
+++ b/src/INTEL/math_extra_intel.h
@@ -100,12 +100,12 @@
    normalize a vector, return in ans
 ------------------------------------------------------------------------- */
 
-#define ME_normalize3(v0, v1, v2, ans)                  \
-{                                                       \
-  flt_t scale = (flt_t)1.0 / sqrt(v0*v0+v1*v1+v2*v2);   \
-  ans##_0 = v0 * scale;                                 \
-  ans##_1 = v1 * scale;                                 \
-  ans##_2 = v2 * scale;                                 \
+#define ME_normalize3(v0, v1, v2, ans)                          \
+{                                                               \
+  flt_t scale = (flt_t)1.0 / std::sqrt(v0*v0+v1*v1+v2*v2);      \
+  ans##_0 = v0 * scale;                                         \
+  ans##_1 = v1 * scale;                                         \
+  ans##_2 = v2 * scale;                                         \
 }
 
 /* ----------------------------------------------------------------------
@@ -359,7 +359,7 @@
 #define ME_qnormalize(q)                                                \
 {                                                                       \
   double norm = 1.0 /                                                   \
-    sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k);        \
+    std::sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k);   \
   q##_w *= norm;                                                        \
   q##_i *= norm;                                                        \
   q##_j *= norm;                                                        \
diff --git a/src/INTEL/npair_full_bin_ghost_intel.cpp b/src/INTEL/npair_full_bin_ghost_intel.cpp
index b7b9ee4aea..381db62c08 100644
--- a/src/INTEL/npair_full_bin_ghost_intel.cpp
+++ b/src/INTEL/npair_full_bin_ghost_intel.cpp
@@ -292,12 +292,13 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
       else ito -= wlocal - nlocal;
 
       int e_ito = ito;
-      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
+      const bigint list_size = (bigint)(e_ito + tid * 2 + 2) *
+        (bigint)maxnbors;
 
       int pack_offset = maxnbors;
-      int ct = (ifrom + tid * 2) * maxnbors;
+      bigint ct = (bigint)(ifrom + tid * 2) * (bigint)maxnbors;
       int * _noalias neighptr = intel_list + ct;
-      const int obound = pack_offset + maxnbors * 2;
+      const bigint obound = pack_offset + maxnbors * 2;
 
       const int toffs = tid * ncache_stride;
       flt_t * _noalias const tx = ncachex + toffs;
diff --git a/src/INTEL/npair_intel.cpp b/src/INTEL/npair_intel.cpp
index b85873f677..8b05aa38aa 100644
--- a/src/INTEL/npair_intel.cpp
+++ b/src/INTEL/npair_intel.cpp
@@ -287,16 +287,17 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
         if (imod) e_ito += pack_width - imod;
       }
       #endif
-      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
+      const bigint list_size = (bigint)(e_ito + tid * 2 + 2) *
+        (bigint)maxnbors;
 
       #ifdef LMP_INTEL_3BODY_FAST
       const int pack_offset = maxnbors * pack_width;
-      const int obound = pack_offset + maxnbors * 2;
+      const bigint obound = pack_offset + maxnbors * 2;
       #else
       const int pack_offset = 0;
-      const int obound = maxnbors * 3;
+      const bigint obound = maxnbors * 3;
       #endif
-      int ct = (ifrom + tid * 2) * maxnbors;
+      bigint ct = (bigint)(ifrom + tid * 2) * (bigint)maxnbors;
       int * _noalias neighptr = intel_list + ct;
       int * _noalias neighptr2;
       if (THREE) neighptr2 = neighptr;
diff --git a/src/INTEL/pair_airebo_intel.cpp b/src/INTEL/pair_airebo_intel.cpp
index 0b05447dd7..7bc2b3edb8 100644
--- a/src/INTEL/pair_airebo_intel.cpp
+++ b/src/INTEL/pair_airebo_intel.cpp
@@ -905,7 +905,7 @@ inline flt_t frebo_pij(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i, int j,
       flt_t rho_k = ka->params.rho[ktype][1];
       flt_t rho_j = ka->params.rho[jtype][1];
       flt_t lamdajik = 4 * itype * ((rho_k - rikmag) - (rho_j - rijmag));
-      flt_t ex_lam = exp(lamdajik);
+      flt_t ex_lam = overloaded::exp(lamdajik);
       flt_t rcminik = ka->params.rcmin[itype][ktype];
       flt_t rcmaxik = ka->params.rcmax[itype][ktype];
       flt_t dwik;
diff --git a/src/INTEL/pair_buck_coul_cut_intel.cpp b/src/INTEL/pair_buck_coul_cut_intel.cpp
index 01a9f91bcf..424e38c16a 100644
--- a/src/INTEL/pair_buck_coul_cut_intel.cpp
+++ b/src/INTEL/pair_buck_coul_cut_intel.cpp
@@ -267,7 +267,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
           const flt_t delz = ztmp - x[j].z;
           const int jtype = IP_PRE_dword_index(x[j].w);
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
-          const flt_t r = sqrt(rsq);
+          const flt_t r = std::sqrt(rsq);
           const flt_t r2inv = (flt_t)1.0 / rsq;
 
           #ifdef INTEL_VMASK
diff --git a/src/INTEL/pair_buck_coul_long_intel.cpp b/src/INTEL/pair_buck_coul_long_intel.cpp
index ff57b571a4..48007c30d9 100644
--- a/src/INTEL/pair_buck_coul_long_intel.cpp
+++ b/src/INTEL/pair_buck_coul_long_intel.cpp
@@ -322,7 +322,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
           const int jtype = tjtype[jj];
           const flt_t rsq = trsq[jj];
           const flt_t r2inv = (flt_t)1.0 / rsq;
-          const flt_t r = (flt_t)1.0 / sqrt(r2inv);
+          const flt_t r = (flt_t)1.0 / std::sqrt(r2inv);
 
           #ifdef INTEL_ALLOW_TABLE
           if (!ncoultablebits || rsq <= tabinnersq) {
diff --git a/src/INTEL/pair_buck_intel.cpp b/src/INTEL/pair_buck_intel.cpp
index 2461361788..1f2a033784 100644
--- a/src/INTEL/pair_buck_intel.cpp
+++ b/src/INTEL/pair_buck_intel.cpp
@@ -255,7 +255,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
           const flt_t delz = ztmp - x[j].z;
           const int jtype = IP_PRE_dword_index(x[j].w);
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
-          const flt_t r = sqrt(rsq);
+          const flt_t r = std::sqrt(rsq);
           const flt_t r2inv = (flt_t)1.0 / rsq;
 
           #ifdef INTEL_VMASK
diff --git a/src/INTEL/pair_dpd_intel.cpp b/src/INTEL/pair_dpd_intel.cpp
index 011979223e..7b11053d41 100644
--- a/src/INTEL/pair_dpd_intel.cpp
+++ b/src/INTEL/pair_dpd_intel.cpp
@@ -180,7 +180,7 @@ void PairDPDIntel::eval(const int offload, const int vflag,
   ATOM_T * _noalias const x = buffers->get_x(offload);
   typedef struct { double x, y, z; } lmp_vt;
   auto *v = (lmp_vt *)atom->v[0];
-  const flt_t dtinvsqrt = 1.0/sqrt(update->dt);
+  const flt_t dtinvsqrt = 1.0/std::sqrt(update->dt);
 
   const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
@@ -322,7 +322,7 @@ void PairDPDIntel::eval(const int offload, const int vflag,
             icut = parami[jtype].icut;
           }
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
-          const flt_t rinv = (flt_t)1.0/sqrt(rsq);
+          const flt_t rinv = (flt_t)1.0/std::sqrt(rsq);
 
           if (rinv > icut) {
             flt_t factor_dpd, factor_sqrt;
diff --git a/src/INTEL/pair_eam_intel.cpp b/src/INTEL/pair_eam_intel.cpp
index ba5047a15a..9c5d6da5e5 100644
--- a/src/INTEL/pair_eam_intel.cpp
+++ b/src/INTEL/pair_eam_intel.cpp
@@ -340,7 +340,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
           const int j = tj[jj] & NEIGHMASK;
           if (!ONETYPE) jtype = tjtype[jj];
           const flt_t rsq = trsq[jj];
-          flt_t p = sqrt(rsq)*frdr + (flt_t)1.0;
+          flt_t p = std::sqrt(rsq)*frdr + (flt_t)1.0;
           int m = static_cast<int> (p);
           m = MIN(m,nr-1);
           p -= m;
@@ -546,7 +546,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
           const int j = tj[jj] & NEIGHMASK;
           if (!ONETYPE) jtype = tjtype[jj];
           const flt_t rsq = trsq[jj];
-          const flt_t r = sqrt(rsq);
+          const flt_t r = std::sqrt(rsq);
           flt_t p = r*frdr + (flt_t)1.0;
           int m = static_cast<int> (p);
           m = MIN(m,nr-1);
diff --git a/src/INTEL/pair_gayberne_intel.cpp b/src/INTEL/pair_gayberne_intel.cpp
index 5609479388..92e074e5e1 100644
--- a/src/INTEL/pair_gayberne_intel.cpp
+++ b/src/INTEL/pair_gayberne_intel.cpp
@@ -492,7 +492,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 
           flt_t r12hat_0, r12hat_1, r12hat_2;
           ME_normalize3(delx_form[jj], dely_form[jj], delz_form[jj], r12hat);
-          flt_t r = sqrt(rsq_form[jj]);
+          flt_t r = std::sqrt(rsq_form[jj]);
 
           // compute distance of closest approach
 
diff --git a/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp
index e920257ef4..faae6e5cbc 100644
--- a/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp
+++ b/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp
@@ -306,7 +306,7 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
           const int sbindex = tj[jj] >> SBBITS & 3;
           const flt_t rsq = trsq[jj];
           const flt_t r2inv = (flt_t)1.0 / rsq;
-          const flt_t r_inv = (flt_t)1.0 / sqrt(rsq);
+          const flt_t r_inv = (flt_t)1.0 / std::sqrt(rsq);
           forcecoul = qqrd2e * qtmp * q[j] * r_inv;
           if (rsq > cut_coul_innersq) {
             const flt_t ccr = cut_coulsq - rsq;
diff --git a/src/INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/INTEL/pair_lj_charmm_coul_long_intel.cpp
index 0a93621bcd..412af17357 100644
--- a/src/INTEL/pair_lj_charmm_coul_long_intel.cpp
+++ b/src/INTEL/pair_lj_charmm_coul_long_intel.cpp
@@ -339,7 +339,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
             const flt_t EWALD_F = 1.12837917;
             const flt_t INV_EWALD_P = 1.0 / 0.3275911;
 
-            const flt_t r = (flt_t)1.0 / sqrt(r2inv);
+            const flt_t r = (flt_t)1.0 / std::sqrt(r2inv);
             const flt_t grij = g_ewald * r;
             const flt_t expm2 = std::exp(-grij * grij);
             const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
@@ -591,10 +591,10 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
     for (int j = 1; j < tp1; j++) {
       if (i <= j) {
         fc.lj[i][j].x = epsilon[i][j] * 4.0;
-        fc.lj[i][j].y = pow(sigma[i][j],6.0);
+        fc.lj[i][j].y = std::pow(sigma[i][j],6.0);
       } else {
         fc.lj[i][j].x = epsilon[j][i] * 4.0;
-        fc.lj[i][j].y = pow(sigma[j][i],6.0);
+        fc.lj[i][j].y = std::pow(sigma[j][i],6.0);
       }
       fc.cutsq[i][j] = cutsq[i][j];
     }
diff --git a/src/INTEL/pair_lj_cut_coul_long_intel.cpp b/src/INTEL/pair_lj_cut_coul_long_intel.cpp
index 89e556defa..5eb3cbc1f7 100644
--- a/src/INTEL/pair_lj_cut_coul_long_intel.cpp
+++ b/src/INTEL/pair_lj_cut_coul_long_intel.cpp
@@ -332,7 +332,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
             const flt_t EWALD_F = 1.12837917;
             const flt_t INV_EWALD_P = 1.0 / 0.3275911;
 
-            const flt_t r = (flt_t)1.0 / sqrt(r2inv);
+            const flt_t r = (flt_t)1.0 / std::sqrt(r2inv);
             const flt_t grij = g_ewald * r;
             const flt_t expm2 = std::exp(-grij * grij);
             const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
diff --git a/src/INTEL/pair_sw_intel.cpp b/src/INTEL/pair_sw_intel.cpp
index 07d590ee2c..fa62f499de 100644
--- a/src/INTEL/pair_sw_intel.cpp
+++ b/src/INTEL/pair_sw_intel.cpp
@@ -382,7 +382,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
           const flt_t rsq1 = trsq[jj];
 
           const flt_t rinvsq1 = (flt_t)1.0 / rsq1;
-          const flt_t r1 = (flt_t)1.0/sqrt(rinvsq1);
+          const flt_t r1 = (flt_t)1.0/std::sqrt(rinvsq1);
           if (!ONETYPE) cut = p2f[ijtype].cut;
           const flt_t rainv1 = (flt_t)1.0 / (r1 - cut);
 
@@ -475,7 +475,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
             const flt_t rsq2 = trsq[kk];
 
             const flt_t rinvsq2 = (flt_t)1.0 / rsq2;
-            const flt_t r2 = (flt_t)1.0 / sqrt(rinvsq2);
+            const flt_t r2 = (flt_t)1.0 / std::sqrt(rinvsq2);
             const flt_t rainv2 = (flt_t)1.0 / (r2 - cut);
             const flt_t gsrainv2 = sigma_gamma * rainv2;
             const flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2;
diff --git a/src/INTEL/pppm_disp_intel.cpp b/src/INTEL/pppm_disp_intel.cpp
index 319d070e66..cc4aa6ab93 100644
--- a/src/INTEL/pppm_disp_intel.cpp
+++ b/src/INTEL/pppm_disp_intel.cpp
@@ -657,8 +657,8 @@ void PPPMDispIntel::compute(int eflag, int vflag)
 
     energy_1 -= g_ewald*qsqsum/MY_PIS +
       MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
-    energy_6 += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij +
-      1.0/12.0*pow(g_ewald_6,6)*csum;
+    energy_6 += - MY_PI*MY_PIS/(6*volume)*std::pow(g_ewald_6,3)*csumij +
+      1.0/12.0*std::pow(g_ewald_6,6)*csum;
     energy_1 *= qscale;
   }
 
@@ -671,7 +671,7 @@ void PPPMDispIntel::compute(int eflag, int vflag)
     MPI_Allreduce(virial_6,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
     for (i = 0; i < 6; i++) virial[i] += 0.5*volume*virial_all[i];
     if (function[1]+function[2]+function[3]) {
-      double a =  MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij;
+      double a =  MY_PI*MY_PIS/(6*volume)*std::pow(g_ewald_6,3)*csumij;
       virial[0] -= a;
       virial[1] -= a;
       virial[2] -= a;
@@ -690,8 +690,8 @@ void PPPMDispIntel::compute(int eflag, int vflag)
       int tmp;
       for (i = 0; i < atom->nlocal; i++) {
         tmp = atom->type[i];
-        eatom[i] += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp] +
-                      1.0/12.0*pow(g_ewald_6,6)*cii[tmp];
+        eatom[i] += - MY_PI*MY_PIS/(6*volume)*std::pow(g_ewald_6,3)*
+          csumi[tmp] + 1.0/12.0*std::pow(g_ewald_6,6)*cii[tmp];
       }
     }
   }
@@ -703,7 +703,7 @@ void PPPMDispIntel::compute(int eflag, int vflag)
         tmp = atom->type[i];
         //dispersion self virial correction
         for (int n = 0; n < 3; n++) vatom[i][n] -= MY_PI*MY_PIS/(6*volume)*
-                                      pow(g_ewald_6,3)*csumi[tmp];
+                                      std::pow(g_ewald_6,3)*csumi[tmp];
       }
     }
   }
@@ -1783,18 +1783,18 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
       const flt_t s1 = x[i][0] * hx_inv;
       const flt_t s2 = x[i][1] * hy_inv;
       const flt_t s3 = x[i][2] * hz_inv;
-      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
-      sf += fsf_coeff1 * sin(ffour_pi * s1);
+      flt_t sf = fsf_coeff0 * std::sin(ftwo_pi * s1);
+      sf += fsf_coeff1 * std::sin(ffour_pi * s1);
       sf *= twoqsq;
       f[i][0] += qfactor * particle_ekx[i] - fqqrd2es * sf;
 
-      sf = fsf_coeff2 * sin(ftwo_pi * s2);
-      sf += fsf_coeff3 * sin(ffour_pi * s2);
+      sf = fsf_coeff2 * std::sin(ftwo_pi * s2);
+      sf += fsf_coeff3 * std::sin(ffour_pi * s2);
       sf *= twoqsq;
       f[i][1] += qfactor * particle_eky[i] - fqqrd2es * sf;
 
-      sf = fsf_coeff4 * sin(ftwo_pi * s3);
-      sf += fsf_coeff5 * sin(ffour_pi * s3);
+      sf = fsf_coeff4 * std::sin(ftwo_pi * s3);
+      sf += fsf_coeff5 * std::sin(ffour_pi * s3);
       sf *= twoqsq;
 
       if (slabflag != 2) f[i][2] += qfactor * particle_ekz[i] - fqqrd2es * sf;
@@ -2155,18 +2155,18 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
       const flt_t s1 = x[i][0] * hx_inv;
       const flt_t s2 = x[i][1] * hy_inv;
       const flt_t s3 = x[i][2] * hz_inv;
-      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
-      sf += fsf_coeff1 * sin(ffour_pi * s1);
+      flt_t sf = fsf_coeff0 * std::sin(ftwo_pi * s1);
+      sf += fsf_coeff1 * std::sin(ffour_pi * s1);
       sf *= twoljsq;
       f[i][0] += lj * particle_ekx[i] - sf;
 
-      sf = fsf_coeff2 * sin(ftwo_pi * s2);
-      sf += fsf_coeff3 * sin(ffour_pi * s2);
+      sf = fsf_coeff2 * std::sin(ftwo_pi * s2);
+      sf += fsf_coeff3 * std::sin(ffour_pi * s2);
       sf *= twoljsq;
       f[i][1] += lj * particle_eky[i] - sf;
 
-      sf = fsf_coeff4 * sin(ftwo_pi * s3);
-      sf += fsf_coeff5 * sin(ffour_pi * s3);
+      sf = fsf_coeff4 * std::sin(ftwo_pi * s3);
+      sf += fsf_coeff5 * std::sin(ffour_pi * s3);
       sf *= twoljsq;
 
       if (slabflag != 2) f[i][2] += lj * particle_ekz[i] -  sf;
@@ -2702,22 +2702,22 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
       const flt_t s1 = x[i][0] * hx_inv;
       const flt_t s2 = x[i][1] * hy_inv;
       const flt_t s3 = x[i][2] * hz_inv;
-      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
-      sf += fsf_coeff1 * sin(ffour_pi * s1);
+      flt_t sf = fsf_coeff0 * std::sin(ftwo_pi * s1);
+      sf += fsf_coeff1 * std::sin(ffour_pi * s1);
       sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
       f[i][0] += lj0*particle_ekx0[i] + lj1*particle_ekx1[i] +
         lj2*particle_ekx2[i] + lj3*particle_ekx3[i] + lj4*particle_ekx4[i] +
         lj5*particle_ekx5[i] + lj6*particle_ekx6[i] - sf;
 
-      sf = fsf_coeff2 * sin(ftwo_pi * s2);
-      sf += fsf_coeff3 * sin(ffour_pi * s2);
+      sf = fsf_coeff2 * std::sin(ftwo_pi * s2);
+      sf += fsf_coeff3 * std::sin(ffour_pi * s2);
       sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
       f[i][1] += lj0*particle_eky0[i] + lj1*particle_eky1[i] +
         lj2*particle_eky2[i] + lj3*particle_eky3[i] + lj4*particle_eky4[i] +
         lj5*particle_eky5[i] + lj6*particle_eky6[i] - sf;
 
-      sf = fsf_coeff4 * sin(ftwo_pi * s3);
-      sf += fsf_coeff5 * sin(ffour_pi * s3);
+      sf = fsf_coeff4 * std::sin(ftwo_pi * s3);
+      sf += fsf_coeff5 * std::sin(ffour_pi * s3);
       sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
       if (slabflag != 2)
       f[i][2] += lj0*particle_ekz0[i] + lj1*particle_ekz1[i] +
@@ -3101,14 +3101,14 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
       const flt_t s1 = x[i][0] * hx_inv;
       const flt_t s2 = x[i][1] * hy_inv;
       const flt_t s3 = x[i][2] * hz_inv;
-      flt_t sf1 = fsf_coeff0 * sin(ftwo_pi * s1);
-      sf1 += fsf_coeff1 * sin(ffour_pi * s1);
+      flt_t sf1 = fsf_coeff0 * std::sin(ftwo_pi * s1);
+      sf1 += fsf_coeff1 * std::sin(ffour_pi * s1);
 
-      flt_t sf2 = fsf_coeff2 * sin(ftwo_pi * s2);
-      sf2 += fsf_coeff3 * sin(ffour_pi * s2);
+      flt_t sf2 = fsf_coeff2 * std::sin(ftwo_pi * s2);
+      sf2 += fsf_coeff3 * std::sin(ffour_pi * s2);
 
-      flt_t sf3 = fsf_coeff4 * sin(ftwo_pi * s3);
-      sf3 += fsf_coeff5 * sin(ffour_pi * s3);
+      flt_t sf3 = fsf_coeff4 * std::sin(ftwo_pi * s3);
+      sf3 += fsf_coeff5 * std::sin(ffour_pi * s3);
       for (int k = 0; k < nsplit; k++) {
         const flt_t lj = B[nsplit*type + k];
         const flt_t twoljsq = lj*lj * B[k] * 2;
diff --git a/src/INTEL/pppm_intel.cpp b/src/INTEL/pppm_intel.cpp
index 2ceca54d29..f67b3a89b3 100644
--- a/src/INTEL/pppm_intel.cpp
+++ b/src/INTEL/pppm_intel.cpp
@@ -953,18 +953,18 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
       const flt_t s1 = x[i].x * hx_inv;
       const flt_t s2 = x[i].y * hy_inv;
       const flt_t s3 = x[i].z * hz_inv;
-      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
-      sf += fsf_coeff1 * sin(ffour_pi * s1);
+      flt_t sf = fsf_coeff0 * std::sin(ftwo_pi * s1);
+      sf += fsf_coeff1 * std::sin(ffour_pi * s1);
       sf *= twoqsq;
       f[i].x += qfactor * particle_ekx[i] - fqqrd2es * sf;
 
-      sf = fsf_coeff2 * sin(ftwo_pi * s2);
-      sf += fsf_coeff3 * sin(ffour_pi * s2);
+      sf = fsf_coeff2 * std::sin(ftwo_pi * s2);
+      sf += fsf_coeff3 * std::sin(ffour_pi * s2);
       sf *= twoqsq;
       f[i].y += qfactor * particle_eky[i] - fqqrd2es * sf;
 
-      sf = fsf_coeff4 * sin(ftwo_pi * s3);
-      sf += fsf_coeff5 * sin(ffour_pi * s3);
+      sf = fsf_coeff4 * std::sin(ftwo_pi * s3);
+      sf += fsf_coeff5 * std::sin(ffour_pi * s3);
       sf *= twoqsq;
 
       if (slabflag != 2) f[i].z += qfactor * particle_ekz[i] - fqqrd2es * sf;
diff --git a/src/INTEL/verlet_lrt_intel.cpp b/src/INTEL/verlet_lrt_intel.cpp
index 42414420b6..9df17d8cef 100644
--- a/src/INTEL/verlet_lrt_intel.cpp
+++ b/src/INTEL/verlet_lrt_intel.cpp
@@ -19,7 +19,6 @@
 #include "atom_vec.h"
 #include "bond.h"
 #include "comm.h"
-#include "compute.h"
 #include "dihedral.h"
 #include "domain.h"
 #include "error.h"
@@ -27,7 +26,6 @@
 #include "force.h"
 #include "improper.h"
 #include "kspace.h"
-#include "memory.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "output.h"
@@ -35,9 +33,7 @@
 #include "timer.h"
 #include "update.h"
 
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
+#include <cstring>
 
 using namespace LAMMPS_NS;
 
@@ -67,7 +63,7 @@ void VerletLRTIntel::init()
 {
   Verlet::init();
 
-  _intel_kspace = dynamic_cast<PPPMIntel*>(force->kspace_match("^pppm\\..*intel$", 0));
+  _intel_kspace = dynamic_cast<PPPMIntel*>(force->kspace_match("^pppm/.*intel$", 0));
   // include pppm/electrode/intel
 
   #ifndef LMP_INTEL_USELRT
@@ -95,12 +91,15 @@ void VerletLRTIntel::setup(int flag)
   }
   #endif
 
-  if (comm->me == 0) {
-    fprintf(screen,"Setting up VerletLRTIntel run ...\n");
-    fprintf(screen,"  Unit style    : %s\n", update->unit_style);
-    fmt::print(screen,"  Current step  : {}\n", update->ntimestep);
-    fprintf(screen,"  Time step     : %g\n", update->dt);
-    timer->print_timeout(screen);
+  if (comm->me == 0 && screen) {
+    fputs("Setting up VerletLRTIntel run ...\n",screen);
+    if (flag) {
+      fmt::print(screen,"  Unit style    : {}\n"
+                        "  Current step  : {}\n"
+                        "  Time step     : {}\n",
+                 update->unit_style,update->ntimestep,update->dt);
+      timer->print_timeout(screen);
+    }
   }
 
   #if defined(_LMP_INTEL_LRT_PTHREAD)
@@ -110,8 +109,9 @@ void VerletLRTIntel::setup(int flag)
     sched_getaffinity(0, sizeof(cpuset), &cpuset);
     int my_cpu_count = CPU_COUNT(&cpuset);
     if (my_cpu_count < comm->nthreads + 1) {
-      error->warning(FLERR, "Using {} threads per MPI rank, but only {} core(s) allocated "
-                     "for each MPI rank", comm->nthreads + 1, my_cpu_count);
+      error->warning(FLERR, "Using {} threads per MPI rank, but only {} "
+                     "core(s) allocated for each MPI rank",
+                     comm->nthreads + 1, my_cpu_count);
     }
   }
   #endif
@@ -144,6 +144,7 @@ void VerletLRTIntel::setup(int flag)
   domain->box_too_small_check();
   modify->setup_pre_neighbor();
   neighbor->build(1);
+  modify->setup_post_neighbor();
   neighbor->ncalls = 0;
 
   // compute all forces
@@ -189,11 +190,11 @@ void VerletLRTIntel::setup(int flag)
 
   if (kspace_compute_flag) _intel_kspace->compute_second(eflag,vflag);
 
-  modify->pre_reverse(eflag,vflag);
+  modify->setup_pre_reverse(eflag,vflag);
   if (force->newton) comm->reverse_comm();
 
   modify->setup(vflag);
-  output->setup();
+  output->setup(flag);
   update->setupflag = 0;
 }
 
@@ -214,9 +215,10 @@ void VerletLRTIntel::run(int n)
   int n_post_integrate = modify->n_post_integrate;
   int n_pre_exchange = modify->n_pre_exchange;
   int n_pre_neighbor = modify->n_pre_neighbor;
+  int n_post_neighbor = modify->n_post_neighbor;
   int n_pre_force = modify->n_pre_force;
   int n_pre_reverse = modify->n_pre_reverse;
-  int n_post_force = modify->n_post_force_any;
+  int n_post_force_any = modify->n_post_force_any;
   int n_end_of_step = modify->n_end_of_step;
 
   if (atom->sortfreq > 0) sortflag = 1;
@@ -279,6 +281,10 @@ void VerletLRTIntel::run(int n)
       }
       neighbor->build(1);
       timer->stamp(Timer::NEIGH);
+      if (n_post_neighbor) {
+        modify->post_neighbor();
+        timer->stamp(Timer::MODIFY);
+      }
     }
 
     // force computations
@@ -353,7 +359,7 @@ void VerletLRTIntel::run(int n)
 
     // force modifications, final time integration, diagnostics
 
-    if (n_post_force) modify->post_force(vflag);
+    if (n_post_force_any) modify->post_force(vflag);
     modify->final_integrate();
     if (n_end_of_step) modify->end_of_step();
     timer->stamp(Timer::MODIFY);
diff --git a/src/KOKKOS/pair_eam_alloy_kokkos.cpp b/src/KOKKOS/pair_eam_alloy_kokkos.cpp
index ec72cabf33..2b81e21e2b 100644
--- a/src/KOKKOS/pair_eam_alloy_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_alloy_kokkos.cpp
@@ -32,10 +32,10 @@
 #include "potential_file_reader.h"
 
 #include <cmath>
-#include <cstring>
-
 using namespace LAMMPS_NS;
 
+#define MAX_CACHE_ROWS 500
+
 // Cannot use virtual inheritance on the GPU, so must duplicate code
 
 /* ---------------------------------------------------------------------- */
@@ -46,6 +46,7 @@ PairEAMAlloyKokkos<DeviceType>::PairEAMAlloyKokkos(LAMMPS *lmp) : PairEAM(lmp)
   respa_enable = 0;
   single_enable = 0;
   one_coeff = 1;
+  manybody_flag = 1;
 
   kokkosable = 1;
   atomKK = (AtomKokkos *) atom;
@@ -59,10 +60,10 @@ PairEAMAlloyKokkos<DeviceType>::PairEAMAlloyKokkos(LAMMPS *lmp) : PairEAM(lmp)
 template<class DeviceType>
 PairEAMAlloyKokkos<DeviceType>::~PairEAMAlloyKokkos()
 {
-  if (!copymode) {
-    memoryKK->destroy_kokkos(k_eatom,eatom);
-    memoryKK->destroy_kokkos(k_vatom,vatom);
-  }
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_eatom,eatom);
+  memoryKK->destroy_kokkos(k_vatom,vatom);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -118,7 +119,7 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_numneigh = k_list->d_numneigh;
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
-  int inum = list->inum;
+  inum = list->inum;
 
   need_dup = lmp->kokkos->need_dup<DeviceType>();
   if (need_dup) {
@@ -138,9 +139,9 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   // zero out density
 
   if (newton_pair)
-    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyInitialize>(0,nall),*this);
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyInitialize>(0,nall),*this);
   else
-    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyInitialize>(0,nlocal),*this);
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyInitialize>(0,nlocal),*this);
 
   // loop over neighbors of my atoms
 
@@ -152,15 +153,15 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
     if (neighflag == HALF) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelA<HALF,1> >(0,inum),*this);
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelA<HALF,1>>(0,inum),*this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelA<HALF,0> >(0,inum),*this);
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelA<HALF,0>>(0,inum),*this);
       }
     } else if (neighflag == HALFTHREAD) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelA<HALFTHREAD,1> >(0,inum),*this);
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelA<HALFTHREAD,1>>(0,inum),*this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelA<HALFTHREAD,0> >(0,inum),*this);
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelA<HALFTHREAD,0>>(0,inum),*this);
       }
     }
 
@@ -178,18 +179,22 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     // compute kernel B
 
     if (eflag)
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelB<1> >(0,inum),*this,ev);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelB<1>>(0,inum),*this,ev);
     else
-      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelB<0> >(0,inum),*this);
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelB<0>>(0,inum),*this);
 
   } else if (neighflag == FULL) {
 
     // compute kernel AB
 
     if (eflag)
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelAB<1> >(0,inum),*this,ev);
+      Kokkos::parallel_reduce(
+             Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelAB<1>>(0,inum),
+             *this,ev);
     else
-      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelAB<0> >(0,inum),*this);
+      Kokkos::parallel_for(
+            policyInstance<TagPairEAMAlloyKernelAB<0>>::get(inum),
+            *this);
   }
 
   if (eflag) {
@@ -208,41 +213,65 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   if (evflag) {
     if (neighflag == HALF) {
       if (newton_pair) {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelC<HALF,1,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelC<HALF,1,1>>(0,inum),
+              *this,ev);
       } else {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelC<HALF,0,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelC<HALF,0,1>>(0,inum),
+              *this,ev);
       }
     } else if (neighflag == HALFTHREAD) {
       if (newton_pair) {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelC<HALFTHREAD,1,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelC<HALFTHREAD,1,1>>(0,inum),
+              *this,ev);
       } else {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelC<HALFTHREAD,0,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelC<HALFTHREAD,0,1>>(0,inum),
+              *this,ev);
       }
     } else if (neighflag == FULL) {
       if (newton_pair) {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelC<FULL,1,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelC<FULL,1,1>>(0,inum),
+              *this,ev);
       } else {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelC<FULL,0,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyKernelC<FULL,0,1>>(0,inum),
+              *this,ev);
       }
     }
   } else {
     if (neighflag == HALF) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelC<HALF,1,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMAlloyKernelC<HALF,1,0>>::get(inum),
+              *this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelC<HALF,0,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMAlloyKernelC<HALF,0,0>>::get(inum),
+              *this);
       }
     } else if (neighflag == HALFTHREAD) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelC<HALFTHREAD,1,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMAlloyKernelC<HALFTHREAD,1,0>>::get(inum),
+              *this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelC<HALFTHREAD,0,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMAlloyKernelC<HALFTHREAD,0,0>>::get(inum),
+              *this);
       }
     } else if (neighflag == FULL) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelC<FULL,1,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMAlloyKernelC<FULL,1,0>>::get(inum),
+              *this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyKernelC<FULL,0,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMAlloyKernelC<FULL,0,0>>::get(inum),
+              *this);
       }
     }
   }
@@ -427,7 +456,7 @@ int PairEAMAlloyKokkos<DeviceType>::pack_forward_comm_kokkos(int n, DAT::tdual_i
   d_sendlist = k_sendlist.view<DeviceType>();
   iswap = iswap_in;
   v_buf = buf.view<DeviceType>();
-  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyPackForwardComm>(0,n),*this);
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyPackForwardComm>(0,n),*this);
   return n;
 }
 
@@ -445,7 +474,7 @@ void PairEAMAlloyKokkos<DeviceType>::unpack_forward_comm_kokkos(int n, int first
 {
   first = first_in;
   v_buf = buf.view<DeviceType>();
-  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMAlloyUnpackForwardComm>(0,n),*this);
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMAlloyUnpackForwardComm>(0,n),*this);
 }
 
 template<class DeviceType>
@@ -588,6 +617,7 @@ template<class DeviceType>
 template<int EFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelB<EFLAG>, const int &ii, EV_FLOAT& ev) const {
+
   // fp = derivative of embedding energy at each atom
   // phi = embedding energy at each atom
   // if rho > rhomax (e.g. due to close approach of two atoms),
@@ -644,6 +674,7 @@ void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelAB<EFLAG>,
   for (int jj = 0; jj < jnum; jj++) {
     int j = d_neighbors(i,jj);
     j &= NEIGHMASK;
+
     const X_FLOAT delx = xtmp - x(j,0);
     const X_FLOAT dely = ytmp - x(j,1);
     const X_FLOAT delz = ztmp - x(j,2);
@@ -806,6 +837,235 @@ void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelC<NEIGHFLAG
 
 /* ---------------------------------------------------------------------- */
 
+////Specialisation for Neighborlist types Half, HalfThread, Full
+template<class DeviceType>
+template<int EFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelAB<EFLAG>,
+                                     const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member,
+                                     EV_FLOAT& ev) const {
+  int ii = team_member.league_rank()*team_member.team_size() + team_member.team_rank();
+  // rho = density at each atom
+  // loop over neighbors of my atoms
+  const int m_max = d_rhor_spline.extent_int(1);
+  const int j_max = t_ffloat_2d_n7::static_extent(2);
+  const int d_rhor_spline_cached = (m_max > MAX_CACHE_ROWS) ? 0 : 1;
+  Kokkos::View<double*[t_ffloat_2d_n7::static_extent(2)], typename DeviceType::scratch_memory_space,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>> A(team_member.team_scratch(0), MAX_CACHE_ROWS);
+
+  if (d_rhor_spline_cached) {
+    for(int i = team_member.team_rank(); i < m_max*j_max; i+= team_member.team_size()) {
+      int j = i%j_max;
+      int m = i/j_max;
+      A(m,j) = d_rhor_spline(0,m,j);
+    }
+    team_member.team_barrier();
+  }
+  if (ii < inum) {
+    const int i = d_ilist[ii];
+    const X_FLOAT xtmp = x(i,0);
+    const X_FLOAT ytmp = x(i,1);
+    const X_FLOAT ztmp = x(i,2);
+    const int itype = type(i);
+
+    const int jnum = d_numneigh[i];
+
+    F_FLOAT rhotmp = 0.0;
+
+    for (int jj = 0; jj < jnum; jj++) {
+      int j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+
+      const X_FLOAT delx = xtmp - x(j,0);
+      const X_FLOAT dely = ytmp - x(j,1);
+      const X_FLOAT delz = ztmp - x(j,2);
+      const int jtype = type(j);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq) {
+        F_FLOAT p = sqrt(rsq)*rdr + 1.0;
+        int m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+        const int d_type2rhor_ji = d_type2rhor(jtype,itype);
+        if (d_type2rhor_ji == 0 && d_rhor_spline_cached == 1) {
+          rhotmp += ((A(m,3)*p + A(m,4))*p +
+                       A(m,5))*p + A(m,6);
+        } else
+          rhotmp += ((d_rhor_spline(d_type2rhor_ji,m,3)*p + d_rhor_spline(d_type2rhor_ji,m,4))*p +
+                      d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6);
+      }
+
+    }
+    d_rho[i] += rhotmp;
+
+    // fp = derivative of embedding energy at each atom
+    // phi = embedding energy at each atom
+    // if rho > rhomax (e.g. due to close approach of two atoms),
+    //   will exceed table, so add linear term to conserve energy
+
+    F_FLOAT p = d_rho[i]*rdrho + 1.0;
+    int m = static_cast<int> (p);
+    m = MAX(1,MIN(m,nrho-1));
+    p -= m;
+    p = MIN(p,1.0);
+    const int d_type2frho_i = d_type2frho[itype];
+    d_fp[i] = (d_frho_spline(d_type2frho_i,m,0)*p + d_frho_spline(d_type2frho_i,m,1))*p + d_frho_spline(d_type2frho_i,m,2);
+    if (EFLAG) {
+      F_FLOAT phi = ((d_frho_spline(d_type2frho_i,m,3)*p + d_frho_spline(d_type2frho_i,m,4))*p +
+                      d_frho_spline(d_type2frho_i,m,5))*p + d_frho_spline(d_type2frho_i,m,6);
+      if (d_rho[i] > rhomax) phi += d_fp[i] * (d_rho[i]-rhomax);
+      if (eflag_global) ev.evdwl += phi;
+      if (eflag_atom) d_eatom[i] += phi;
+    }
+  }
+}
+
+template<class DeviceType>
+template<int EFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelAB<EFLAG>,
+                                           const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member) const {
+  EV_FLOAT ev;
+  this->template operator()<EFLAG>(TagPairEAMAlloyKernelAB<EFLAG>(), team_member, ev);
+}
+
+/* ---------------------------------------------------------------------- */
+
+////Specialisation for Neighborlist types Half, HalfThread, Full
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>,
+                                           const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member,
+                                           EV_FLOAT& ev) const {
+
+  int ii = team_member.league_rank()*team_member.team_size() + team_member.team_rank();
+
+  // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
+
+  auto v_f = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
+  auto a_f = v_f.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+
+  const int m_max = d_z2r_spline.extent_int(1);
+  const int j_max = t_ffloat_2d_n7::static_extent(2);
+  const int d_z2r_spline_cached = (m_max > MAX_CACHE_ROWS) ? 0 : 1;
+  Kokkos::View<double*[t_ffloat_2d_n7::static_extent(2)], typename DeviceType::scratch_memory_space,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>> A(team_member.team_scratch(0), MAX_CACHE_ROWS);
+
+  if (d_z2r_spline_cached) {
+    for(int i = team_member.team_rank(); i < m_max*j_max; i+= team_member.team_size()) {
+      int j = i%j_max;
+      int m = i/j_max;
+      A(m,j) = d_z2r_spline(0,m,j);
+    }
+    team_member.team_barrier();
+  }
+  if (ii < inum) {
+    const int i = d_ilist[ii];
+    const X_FLOAT xtmp = x(i,0);
+    const X_FLOAT ytmp = x(i,1);
+    const X_FLOAT ztmp = x(i,2);
+    const int itype = type(i);
+
+    const int jnum = d_numneigh[i];
+
+    F_FLOAT fxtmp = 0.0;
+    F_FLOAT fytmp = 0.0;
+    F_FLOAT fztmp = 0.0;
+
+    for (int jj = 0; jj < jnum; jj++) {
+      int j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+      const X_FLOAT delx = xtmp - x(j,0);
+      const X_FLOAT dely = ytmp - x(j,1);
+      const X_FLOAT delz = ztmp - x(j,2);
+      const int jtype = type(j);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq) {
+        const F_FLOAT r = sqrt(rsq);
+        F_FLOAT p = r*rdr + 1.0;
+        int m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+
+        // rhoip = derivative of (density at atom j due to atom i)
+        // rhojp = derivative of (density at atom i due to atom j)
+        // phi = pair potential energy
+        // phip = phi'
+        // z2 = phi * r
+        // z2p = (phi * r)' = (phi' r) + phi
+        // psip needs both fp[i] and fp[j] terms since r_ij appears in two
+        //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
+        //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
+
+        const int d_type2rhor_ij = d_type2rhor(itype,jtype);
+        const F_FLOAT rhoip = (d_rhor_spline(d_type2rhor_ij,m,0)*p + d_rhor_spline(d_type2rhor_ij,m,1))*p +
+                             d_rhor_spline(d_type2rhor_ij,m,2);
+        const int d_type2rhor_ji = d_type2rhor(jtype,itype);
+        const F_FLOAT rhojp = (d_rhor_spline(d_type2rhor_ji,m,0)*p + d_rhor_spline(d_type2rhor_ji,m,1))*p +
+                               d_rhor_spline(d_type2rhor_ji,m,2);
+        const int d_type2z2r_ij = d_type2z2r(itype,jtype);
+
+        const auto have_cache = (d_z2r_spline_cached == 1) && (0 == d_type2z2r_ij);
+        const auto z2r_spline_3 = (have_cache) ? A(m,3) : d_z2r_spline(d_type2z2r_ij,m,3);
+        const auto z2r_spline_4 = (have_cache) ? A(m,4) : d_z2r_spline(d_type2z2r_ij,m,4);
+        const auto z2r_spline_5 = (have_cache) ? A(m,5) : d_z2r_spline(d_type2z2r_ij,m,5);
+        const auto z2r_spline_6 = (have_cache) ? A(m,6) : d_z2r_spline(d_type2z2r_ij,m,6);
+
+        const F_FLOAT z2p = (3.0*rdr*z2r_spline_3*p + 2.0*rdr*z2r_spline_4)*p +
+                             rdr*z2r_spline_5; // the rdr and the factors of 3.0 and 2.0 come out of the interpolate function
+        const F_FLOAT z2 = ((z2r_spline_3*p + z2r_spline_4)*p +
+                             z2r_spline_5)*p + z2r_spline_6;
+
+        const F_FLOAT recip = 1.0/r;
+        const F_FLOAT phi = z2*recip;
+        const F_FLOAT phip = z2p*recip - phi*recip;
+        const F_FLOAT psip = d_fp[i]*rhojp + d_fp[j]*rhoip + phip;
+        const F_FLOAT fpair = -psip*recip;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+
+        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+          a_f(j,0) -= delx*fpair;
+          a_f(j,1) -= dely*fpair;
+          a_f(j,2) -= delz*fpair;
+        }
+
+        if (EVFLAG) {
+          if (eflag) {
+            ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*phi;
+          }
+
+          if (vflag_either || eflag_atom) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,phi,fpair,delx,dely,delz);
+        }
+
+      }
+    }
+
+    a_f(i,0) += fxtmp;
+    a_f(i,1) += fytmp;
+    a_f(i,2) += fztmp;
+  }
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>,
+                /*const int &ii*/
+                const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member) const {
+  EV_FLOAT ev;
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairEAMAlloyKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), team_member, ev);
+}
+
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 template<int NEIGHFLAG, int NEWTON_PAIR>
 KOKKOS_INLINE_FUNCTION
@@ -1214,6 +1474,35 @@ void PairEAMAlloyKokkos<DeviceType>::file2array_alloy()
 
 /* ---------------------------------------------------------------------- */
 
+template<typename DeviceType>
+template<class TAG>
+struct PairEAMAlloyKokkos<DeviceType>::policyInstance {
+  KOKKOS_INLINE_FUNCTION
+  static auto get(int inum) {
+    auto policy = Kokkos::RangePolicy<DeviceType, TAG>(0,inum);
+    return policy;
+  }
+};
+
+#ifdef KOKKOS_ENABLE_HIP
+template<>
+template<class TAG>
+struct PairEAMAlloyKokkos<Kokkos::Experimental::HIP>::policyInstance {
+  KOKKOS_INLINE_FUNCTION
+  static auto get(int inum) {
+    static_assert(t_ffloat_2d_n7::static_extent(2) == 7,
+                  "Breaking assumption of spline dim for KernelAB and KernelC scratch caching");
+
+    auto policy = Kokkos::TeamPolicy<Kokkos::Experimental::HIP,TAG>((inum+1023)/1024, 1024)
+                           .set_scratch_size(0,
+                                Kokkos::PerTeam(MAX_CACHE_ROWS*7*sizeof(double)));
+    return policy;
+  }
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
 namespace LAMMPS_NS {
 template class PairEAMAlloyKokkos<LMPDeviceType>;
 #ifdef LMP_KOKKOS_GPU
diff --git a/src/KOKKOS/pair_eam_alloy_kokkos.h b/src/KOKKOS/pair_eam_alloy_kokkos.h
index c6cb73ca5d..2eb40189ac 100644
--- a/src/KOKKOS/pair_eam_alloy_kokkos.h
+++ b/src/KOKKOS/pair_eam_alloy_kokkos.h
@@ -61,7 +61,6 @@ class PairEAMAlloyKokkos : public PairEAM, public KokkosBase {
   ~PairEAMAlloyKokkos() override;
   void compute(int, int) override;
   void init_style() override;
-  void *extract(const char *, int &) override { return nullptr; }
   void coeff(int, char **) override;
 
   KOKKOS_INLINE_FUNCTION
@@ -93,6 +92,14 @@ class PairEAMAlloyKokkos : public PairEAM, public KokkosBase {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairEAMAlloyKernelAB<EFLAG>, const int&) const;
 
+  template<int EFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairEAMAlloyKernelAB<EFLAG>, const typename Kokkos::TeamPolicy<DeviceType>::member_type&, EV_FLOAT&) const;
+
+  template<int EFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairEAMAlloyKernelAB<EFLAG>, const typename Kokkos::TeamPolicy<DeviceType>::member_type&) const;
+
   template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairEAMAlloyKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
@@ -101,6 +108,14 @@ class PairEAMAlloyKokkos : public PairEAM, public KokkosBase {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairEAMAlloyKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
 
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairEAMAlloyKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const typename Kokkos::TeamPolicy<DeviceType>::member_type&, EV_FLOAT&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairEAMAlloyKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const typename Kokkos::TeamPolicy<DeviceType>::member_type&) const;
+
   template<int NEIGHFLAG, int NEWTON_PAIR>
   KOKKOS_INLINE_FUNCTION
   void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
@@ -125,7 +140,7 @@ class PairEAMAlloyKokkos : public PairEAM, public KokkosBase {
   typename AT::t_efloat_1d d_eatom;
   typename AT::t_virial_array d_vatom;
 
-  int need_dup;
+  int need_dup,inum;
 
   using KKDeviceType = typename KKDevice<DeviceType>::value;
 
@@ -139,7 +154,6 @@ class PairEAMAlloyKokkos : public PairEAM, public KokkosBase {
   DupScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout> dup_f;
   DupScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout> dup_eatom;
   DupScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout> dup_vatom;
-
   NonDupScatterView<F_FLOAT*, typename DAT::t_ffloat_1d::array_layout> ndup_rho;
   NonDupScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout> ndup_f;
   NonDupScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout> ndup_eatom;
@@ -163,17 +177,18 @@ class PairEAMAlloyKokkos : public PairEAM, public KokkosBase {
   t_ffloat_2d_n7 d_frho_spline;
   t_ffloat_2d_n7 d_rhor_spline;
   t_ffloat_2d_n7 d_z2r_spline;
-
+  void interpolate(int, double, double *, t_host_ffloat_2d_n7, int);
   void file2array() override;
   void file2array_alloy();
   void array2spline() override;
-  void interpolate(int, double, double *, t_host_ffloat_2d_n7, int);
   void read_file(char *) override;
 
+  template<class TAG>
+  struct policyInstance;
+
   typename AT::t_neighbors_2d d_neighbors;
   typename AT::t_int_1d d_ilist;
   typename AT::t_int_1d d_numneigh;
-  //NeighListKokkos<DeviceType> k_list;
 
   int iswap;
   int first;
@@ -187,7 +202,6 @@ class PairEAMAlloyKokkos : public PairEAM, public KokkosBase {
 };
 
 }
-
 #endif
 #endif
 
diff --git a/src/KOKKOS/pair_eam_fs_kokkos.cpp b/src/KOKKOS/pair_eam_fs_kokkos.cpp
index 50abbc3191..4572d14e48 100644
--- a/src/KOKKOS/pair_eam_fs_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_fs_kokkos.cpp
@@ -32,10 +32,10 @@
 #include "potential_file_reader.h"
 
 #include <cmath>
-#include <cstring>
-
 using namespace LAMMPS_NS;
 
+#define MAX_CACHE_ROWS 500
+
 // Cannot use virtual inheritance on the GPU, so must duplicate code
 
 /* ---------------------------------------------------------------------- */
@@ -46,6 +46,7 @@ PairEAMFSKokkos<DeviceType>::PairEAMFSKokkos(LAMMPS *lmp) : PairEAM(lmp)
   respa_enable = 0;
   single_enable = 0;
   one_coeff = 1;
+  manybody_flag = 1;
 
   kokkosable = 1;
   atomKK = (AtomKokkos *) atom;
@@ -59,10 +60,10 @@ PairEAMFSKokkos<DeviceType>::PairEAMFSKokkos(LAMMPS *lmp) : PairEAM(lmp)
 template<class DeviceType>
 PairEAMFSKokkos<DeviceType>::~PairEAMFSKokkos()
 {
-  if (!copymode) {
-    memoryKK->destroy_kokkos(k_eatom,eatom);
-    memoryKK->destroy_kokkos(k_vatom,vatom);
-  }
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_eatom,eatom);
+  memoryKK->destroy_kokkos(k_vatom,vatom);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -118,7 +119,7 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_numneigh = k_list->d_numneigh;
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
-  int inum = list->inum;
+  inum = list->inum;
 
   need_dup = lmp->kokkos->need_dup<DeviceType>();
   if (need_dup) {
@@ -138,9 +139,9 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   // zero out density
 
   if (newton_pair)
-    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSInitialize>(0,nall),*this);
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMFSInitialize>(0,nall),*this);
   else
-    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSInitialize>(0,nlocal),*this);
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMFSInitialize>(0,nlocal),*this);
 
   // loop over neighbors of my atoms
 
@@ -152,15 +153,15 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
     if (neighflag == HALF) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelA<HALF,1> >(0,inum),*this);
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelA<HALF,1>>(0,inum),*this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelA<HALF,0> >(0,inum),*this);
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelA<HALF,0>>(0,inum),*this);
       }
     } else if (neighflag == HALFTHREAD) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelA<HALFTHREAD,1> >(0,inum),*this);
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelA<HALFTHREAD,1>>(0,inum),*this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelA<HALFTHREAD,0> >(0,inum),*this);
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelA<HALFTHREAD,0>>(0,inum),*this);
       }
     }
 
@@ -178,18 +179,22 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     // compute kernel B
 
     if (eflag)
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelB<1> >(0,inum),*this,ev);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelB<1>>(0,inum),*this,ev);
     else
-      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelB<0> >(0,inum),*this);
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelB<0>>(0,inum),*this);
 
   } else if (neighflag == FULL) {
 
     // compute kernel AB
 
     if (eflag)
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelAB<1> >(0,inum),*this,ev);
+      Kokkos::parallel_reduce(
+             Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelAB<1>>(0,inum),
+             *this,ev);
     else
-      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelAB<0> >(0,inum),*this);
+      Kokkos::parallel_for(
+            policyInstance<TagPairEAMFSKernelAB<0>>::get(inum),
+            *this);
   }
 
   if (eflag) {
@@ -197,7 +202,7 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     ev.evdwl = 0.0;
   }
 
-  // communicate derivative of embedding function (on the device)
+  // communicate derivative of embedding function
 
   k_fp.template modify<DeviceType>();
   comm->forward_comm(this);
@@ -208,41 +213,65 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   if (evflag) {
     if (neighflag == HALF) {
       if (newton_pair) {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelC<HALF,1,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelC<HALF,1,1>>(0,inum),
+              *this,ev);
       } else {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelC<HALF,0,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelC<HALF,0,1>>(0,inum),
+              *this,ev);
       }
     } else if (neighflag == HALFTHREAD) {
       if (newton_pair) {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelC<HALFTHREAD,1,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelC<HALFTHREAD,1,1>>(0,inum),
+              *this,ev);
       } else {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelC<HALFTHREAD,0,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelC<HALFTHREAD,0,1>>(0,inum),
+              *this,ev);
       }
     } else if (neighflag == FULL) {
       if (newton_pair) {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelC<FULL,1,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelC<FULL,1,1>>(0,inum),
+              *this,ev);
       } else {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelC<FULL,0,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMFSKernelC<FULL,0,1>>(0,inum),
+              *this,ev);
       }
     }
   } else {
     if (neighflag == HALF) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelC<HALF,1,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMFSKernelC<HALF,1,0>>::get(inum),
+              *this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelC<HALF,0,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMFSKernelC<HALF,0,0>>::get(inum),
+              *this);
       }
     } else if (neighflag == HALFTHREAD) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelC<HALFTHREAD,1,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMFSKernelC<HALFTHREAD,1,0>>::get(inum),
+              *this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelC<HALFTHREAD,0,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMFSKernelC<HALFTHREAD,0,0>>::get(inum),
+              *this);
       }
     } else if (neighflag == FULL) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelC<FULL,1,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMFSKernelC<FULL,1,0>>::get(inum),
+              *this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSKernelC<FULL,0,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMFSKernelC<FULL,0,0>>::get(inum),
+              *this);
       }
     }
   }
@@ -427,7 +456,7 @@ int PairEAMFSKokkos<DeviceType>::pack_forward_comm_kokkos(int n, DAT::tdual_int_
   d_sendlist = k_sendlist.view<DeviceType>();
   iswap = iswap_in;
   v_buf = buf.view<DeviceType>();
-  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSPackForwardComm>(0,n),*this);
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMFSPackForwardComm>(0,n),*this);
   return n;
 }
 
@@ -445,7 +474,7 @@ void PairEAMFSKokkos<DeviceType>::unpack_forward_comm_kokkos(int n, int first_in
 {
   first = first_in;
   v_buf = buf.view<DeviceType>();
-  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMFSUnpackForwardComm>(0,n),*this);
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagPairEAMFSUnpackForwardComm>(0,n),*this);
 }
 
 template<class DeviceType>
@@ -645,6 +674,7 @@ void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelAB<EFLAG>, const
   for (int jj = 0; jj < jnum; jj++) {
     int j = d_neighbors(i,jj);
     j &= NEIGHMASK;
+
     const X_FLOAT delx = xtmp - x(j,0);
     const X_FLOAT dely = ytmp - x(j,1);
     const X_FLOAT delz = ztmp - x(j,2);
@@ -691,7 +721,7 @@ template<class DeviceType>
 template<int EFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelAB<EFLAG>, const int &ii) const {
-  EV_FLOAT ev;
+  EV_FLOAT ev;  this->template operator()<EFLAG>(TagPairEAMFSKernelAB<EFLAG>(), ii, ev);
   this->template operator()<EFLAG>(TagPairEAMFSKernelAB<EFLAG>(), ii, ev);
 }
 
@@ -807,6 +837,235 @@ void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelC<NEIGHFLAG,NEWTO
 
 /* ---------------------------------------------------------------------- */
 
+////Specialisation for Neighborlist types Half, HalfThread, Full
+template<class DeviceType>
+template<int EFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelAB<EFLAG>,
+                                     const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member,
+                                     EV_FLOAT& ev) const {
+  int ii = team_member.league_rank()*team_member.team_size() + team_member.team_rank();
+  // rho = density at each atom
+  // loop over neighbors of my atoms
+  const int m_max = d_rhor_spline.extent_int(1);
+  const int j_max = t_ffloat_2d_n7::static_extent(2);
+  const int d_rhor_spline_cached = (m_max > MAX_CACHE_ROWS) ? 0 : 1;
+  Kokkos::View<double*[t_ffloat_2d_n7::static_extent(2)], typename DeviceType::scratch_memory_space,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>> A(team_member.team_scratch(0), MAX_CACHE_ROWS);
+
+  if (d_rhor_spline_cached) {
+    for(int i = team_member.team_rank(); i < m_max*j_max; i+= team_member.team_size()) {
+      int j = i%j_max;
+      int m = i/j_max;
+      A(m,j) = d_rhor_spline(0,m,j);
+    }
+    team_member.team_barrier();
+  }
+  if (ii < inum) {
+    const int i = d_ilist[ii];
+    const X_FLOAT xtmp = x(i,0);
+    const X_FLOAT ytmp = x(i,1);
+    const X_FLOAT ztmp = x(i,2);
+    const int itype = type(i);
+
+    const int jnum = d_numneigh[i];
+
+    F_FLOAT rhotmp = 0.0;
+
+    for (int jj = 0; jj < jnum; jj++) {
+      int j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+
+      const X_FLOAT delx = xtmp - x(j,0);
+      const X_FLOAT dely = ytmp - x(j,1);
+      const X_FLOAT delz = ztmp - x(j,2);
+      const int jtype = type(j);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq) {
+        F_FLOAT p = sqrt(rsq)*rdr + 1.0;
+        int m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+        const int d_type2rhor_ji = d_type2rhor(jtype,itype);
+        if (d_type2rhor_ji == 0 && d_rhor_spline_cached == 1) {
+          rhotmp += ((A(m,3)*p + A(m,4))*p +
+                       A(m,5))*p + A(m,6);
+        } else
+          rhotmp += ((d_rhor_spline(d_type2rhor_ji,m,3)*p + d_rhor_spline(d_type2rhor_ji,m,4))*p +
+                      d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6);
+      }
+
+    }
+    d_rho[i] += rhotmp;
+
+    // fp = derivative of embedding energy at each atom
+    // phi = embedding energy at each atom
+    // if rho > rhomax (e.g. due to close approach of two atoms),
+    //   will exceed table, so add linear term to conserve energy
+
+    F_FLOAT p = d_rho[i]*rdrho + 1.0;
+    int m = static_cast<int> (p);
+    m = MAX(1,MIN(m,nrho-1));
+    p -= m;
+    p = MIN(p,1.0);
+    const int d_type2frho_i = d_type2frho[itype];
+    d_fp[i] = (d_frho_spline(d_type2frho_i,m,0)*p + d_frho_spline(d_type2frho_i,m,1))*p + d_frho_spline(d_type2frho_i,m,2);
+    if (EFLAG) {
+      F_FLOAT phi = ((d_frho_spline(d_type2frho_i,m,3)*p + d_frho_spline(d_type2frho_i,m,4))*p +
+                      d_frho_spline(d_type2frho_i,m,5))*p + d_frho_spline(d_type2frho_i,m,6);
+      if (d_rho[i] > rhomax) phi += d_fp[i] * (d_rho[i]-rhomax);
+      if (eflag_global) ev.evdwl += phi;
+      if (eflag_atom) d_eatom[i] += phi;
+    }
+  }
+}
+
+template<class DeviceType>
+template<int EFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelAB<EFLAG>,
+                                           const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member) const {
+  EV_FLOAT ev;
+  this->template operator()<EFLAG>(TagPairEAMFSKernelAB<EFLAG>(), team_member, ev);
+}
+
+/* ---------------------------------------------------------------------- */
+
+////Specialisation for Neighborlist types Half, HalfThread, Full
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>,
+                                           const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member,
+                                           EV_FLOAT& ev) const {
+
+  int ii = team_member.league_rank()*team_member.team_size() + team_member.team_rank();
+
+  // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
+
+  auto v_f = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
+  auto a_f = v_f.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+
+  const int m_max = d_z2r_spline.extent_int(1);
+  const int j_max = t_ffloat_2d_n7::static_extent(2);
+  const int d_z2r_spline_cached = (m_max > MAX_CACHE_ROWS) ? 0 : 1;
+  Kokkos::View<double*[t_ffloat_2d_n7::static_extent(2)], typename DeviceType::scratch_memory_space,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>> A(team_member.team_scratch(0), MAX_CACHE_ROWS);
+
+  if (d_z2r_spline_cached) {
+    for(int i = team_member.team_rank(); i < m_max*j_max; i+= team_member.team_size()) {
+      int j = i%j_max;
+      int m = i/j_max;
+      A(m,j) = d_z2r_spline(0,m,j);
+    }
+    team_member.team_barrier();
+  }
+  if (ii < inum) {
+    const int i = d_ilist[ii];
+    const X_FLOAT xtmp = x(i,0);
+    const X_FLOAT ytmp = x(i,1);
+    const X_FLOAT ztmp = x(i,2);
+    const int itype = type(i);
+
+    const int jnum = d_numneigh[i];
+
+    F_FLOAT fxtmp = 0.0;
+    F_FLOAT fytmp = 0.0;
+    F_FLOAT fztmp = 0.0;
+
+    for (int jj = 0; jj < jnum; jj++) {
+      int j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+      const X_FLOAT delx = xtmp - x(j,0);
+      const X_FLOAT dely = ytmp - x(j,1);
+      const X_FLOAT delz = ztmp - x(j,2);
+      const int jtype = type(j);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq) {
+        const F_FLOAT r = sqrt(rsq);
+        F_FLOAT p = r*rdr + 1.0;
+        int m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+
+        // rhoip = derivative of (density at atom j due to atom i)
+        // rhojp = derivative of (density at atom i due to atom j)
+        // phi = pair potential energy
+        // phip = phi'
+        // z2 = phi * r
+        // z2p = (phi * r)' = (phi' r) + phi
+        // psip needs both fp[i] and fp[j] terms since r_ij appears in two
+        //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
+        //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
+
+        const int d_type2rhor_ij = d_type2rhor(itype,jtype);
+        const F_FLOAT rhoip = (d_rhor_spline(d_type2rhor_ij,m,0)*p + d_rhor_spline(d_type2rhor_ij,m,1))*p +
+                             d_rhor_spline(d_type2rhor_ij,m,2);
+        const int d_type2rhor_ji = d_type2rhor(jtype,itype);
+        const F_FLOAT rhojp = (d_rhor_spline(d_type2rhor_ji,m,0)*p + d_rhor_spline(d_type2rhor_ji,m,1))*p +
+                               d_rhor_spline(d_type2rhor_ji,m,2);
+        const int d_type2z2r_ij = d_type2z2r(itype,jtype);
+
+        const auto have_cache = (d_z2r_spline_cached == 1) && (0 == d_type2z2r_ij);
+        const auto z2r_spline_3 = (have_cache) ? A(m,3) : d_z2r_spline(d_type2z2r_ij,m,3);
+        const auto z2r_spline_4 = (have_cache) ? A(m,4) : d_z2r_spline(d_type2z2r_ij,m,4);
+        const auto z2r_spline_5 = (have_cache) ? A(m,5) : d_z2r_spline(d_type2z2r_ij,m,5);
+        const auto z2r_spline_6 = (have_cache) ? A(m,6) : d_z2r_spline(d_type2z2r_ij,m,6);
+
+        const F_FLOAT z2p = (3.0*rdr*z2r_spline_3*p + 2.0*rdr*z2r_spline_4)*p +
+                             rdr*z2r_spline_5; // the rdr and the factors of 3.0 and 2.0 come out of the interpolate function
+        const F_FLOAT z2 = ((z2r_spline_3*p + z2r_spline_4)*p +
+                             z2r_spline_5)*p + z2r_spline_6;
+
+        const F_FLOAT recip = 1.0/r;
+        const F_FLOAT phi = z2*recip;
+        const F_FLOAT phip = z2p*recip - phi*recip;
+        const F_FLOAT psip = d_fp[i]*rhojp + d_fp[j]*rhoip + phip;
+        const F_FLOAT fpair = -psip*recip;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+
+        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+          a_f(j,0) -= delx*fpair;
+          a_f(j,1) -= dely*fpair;
+          a_f(j,2) -= delz*fpair;
+        }
+
+        if (EVFLAG) {
+          if (eflag) {
+            ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*phi;
+          }
+
+          if (vflag_either || eflag_atom) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,phi,fpair,delx,dely,delz);
+        }
+
+      }
+    }
+
+    a_f(i,0) += fxtmp;
+    a_f(i,1) += fytmp;
+    a_f(i,2) += fztmp;
+  }
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>,
+                /*const int &ii*/
+                const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member) const {
+  EV_FLOAT ev;
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairEAMFSKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), team_member, ev);
+}
+
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 template<int NEIGHFLAG, int NEWTON_PAIR>
 KOKKOS_INLINE_FUNCTION
@@ -1225,6 +1484,35 @@ void PairEAMFSKokkos<DeviceType>::file2array_fs()
 
 /* ---------------------------------------------------------------------- */
 
+template<typename DeviceType>
+template<class TAG>
+struct PairEAMFSKokkos<DeviceType>::policyInstance {
+  KOKKOS_INLINE_FUNCTION
+  static auto get(int inum) {
+    auto policy = Kokkos::RangePolicy<DeviceType, TAG>(0,inum);
+    return policy;
+  }
+};
+
+#ifdef KOKKOS_ENABLE_HIP
+template<>
+template<class TAG>
+struct PairEAMFSKokkos<Kokkos::Experimental::HIP>::policyInstance {
+  KOKKOS_INLINE_FUNCTION
+  static auto get(int inum) {
+    static_assert(t_ffloat_2d_n7::static_extent(2) == 7,
+                  "Breaking assumption of spline dim for KernelAB and KernelC scratch caching");
+
+    auto policy = Kokkos::TeamPolicy<Kokkos::Experimental::HIP,TAG>((inum+1023)/1024, 1024)
+                           .set_scratch_size(0,
+                                Kokkos::PerTeam(MAX_CACHE_ROWS*7*sizeof(double)));
+    return policy;
+  }
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
 namespace LAMMPS_NS {
 template class PairEAMFSKokkos<LMPDeviceType>;
 #ifdef LMP_KOKKOS_GPU
diff --git a/src/KOKKOS/pair_eam_fs_kokkos.h b/src/KOKKOS/pair_eam_fs_kokkos.h
index 06a395cba7..bd03ab0015 100644
--- a/src/KOKKOS/pair_eam_fs_kokkos.h
+++ b/src/KOKKOS/pair_eam_fs_kokkos.h
@@ -61,7 +61,6 @@ class PairEAMFSKokkos : public PairEAM, public KokkosBase {
   ~PairEAMFSKokkos() override;
   void compute(int, int) override;
   void init_style() override;
-  void *extract(const char *, int &) override { return nullptr; }
   void coeff(int, char **) override;
 
   KOKKOS_INLINE_FUNCTION
@@ -93,6 +92,14 @@ class PairEAMFSKokkos : public PairEAM, public KokkosBase {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairEAMFSKernelAB<EFLAG>, const int&) const;
 
+  template<int EFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairEAMFSKernelAB<EFLAG>, const typename Kokkos::TeamPolicy<DeviceType>::member_type&, EV_FLOAT&) const;
+
+  template<int EFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairEAMFSKernelAB<EFLAG>, const typename Kokkos::TeamPolicy<DeviceType>::member_type&) const;
+
   template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairEAMFSKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
@@ -101,6 +108,14 @@ class PairEAMFSKokkos : public PairEAM, public KokkosBase {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairEAMFSKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
 
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairEAMFSKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const typename Kokkos::TeamPolicy<DeviceType>::member_type&, EV_FLOAT&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairEAMFSKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const typename Kokkos::TeamPolicy<DeviceType>::member_type&) const;
+
   template<int NEIGHFLAG, int NEWTON_PAIR>
   KOKKOS_INLINE_FUNCTION
   void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
@@ -125,7 +140,7 @@ class PairEAMFSKokkos : public PairEAM, public KokkosBase {
   typename AT::t_efloat_1d d_eatom;
   typename AT::t_virial_array d_vatom;
 
-  int need_dup;
+  int need_dup,inum;
 
   using KKDeviceType = typename KKDevice<DeviceType>::value;
 
@@ -139,7 +154,6 @@ class PairEAMFSKokkos : public PairEAM, public KokkosBase {
   DupScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout> dup_f;
   DupScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout> dup_eatom;
   DupScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout> dup_vatom;
-
   NonDupScatterView<F_FLOAT*, typename DAT::t_ffloat_1d::array_layout> ndup_rho;
   NonDupScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout> ndup_f;
   NonDupScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout> ndup_eatom;
@@ -163,13 +177,15 @@ class PairEAMFSKokkos : public PairEAM, public KokkosBase {
   t_ffloat_2d_n7 d_frho_spline;
   t_ffloat_2d_n7 d_rhor_spline;
   t_ffloat_2d_n7 d_z2r_spline;
-
+  void interpolate(int, double, double *, t_host_ffloat_2d_n7, int);
   void file2array() override;
   void file2array_fs();
   void array2spline() override;
-  void interpolate(int, double, double *, t_host_ffloat_2d_n7, int);
   void read_file(char *) override;
 
+  template<class TAG>
+  struct policyInstance;
+
   typename AT::t_neighbors_2d d_neighbors;
   typename AT::t_int_1d d_ilist;
   typename AT::t_int_1d d_numneigh;
@@ -186,7 +202,6 @@ class PairEAMFSKokkos : public PairEAM, public KokkosBase {
 };
 
 }
-
 #endif
 #endif
 
diff --git a/src/KOKKOS/pair_eam_kokkos.cpp b/src/KOKKOS/pair_eam_kokkos.cpp
index 6e1cd1feea..de6d3646bf 100644
--- a/src/KOKKOS/pair_eam_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_kokkos.cpp
@@ -31,9 +31,10 @@
 #include "pair_kokkos.h"
 
 #include <cmath>
-
 using namespace LAMMPS_NS;
 
+#define MAX_CACHE_ROWS 500
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
@@ -113,7 +114,7 @@ void PairEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   d_numneigh = k_list->d_numneigh;
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
-  int inum = list->inum;
+  inum = list->inum;
 
   need_dup = lmp->kokkos->need_dup<DeviceType>();
   if (need_dup) {
@@ -147,15 +148,15 @@ void PairEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
     if (neighflag == HALF) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelA<HALF,1> >(0,inum),*this);
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelA<HALF,1>>(0,inum),*this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelA<HALF,0> >(0,inum),*this);
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelA<HALF,0>>(0,inum),*this);
       }
     } else if (neighflag == HALFTHREAD) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelA<HALFTHREAD,1> >(0,inum),*this);
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelA<HALFTHREAD,1>>(0,inum),*this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelA<HALFTHREAD,0> >(0,inum),*this);
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelA<HALFTHREAD,0>>(0,inum),*this);
       }
     }
 
@@ -173,18 +174,22 @@ void PairEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     // compute kernel B
 
     if (eflag)
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelB<1> >(0,inum),*this,ev);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelB<1>>(0,inum),*this,ev);
     else
-      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelB<0> >(0,inum),*this);
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelB<0>>(0,inum),*this);
 
   } else if (neighflag == FULL) {
 
     // compute kernel AB
 
     if (eflag)
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelAB<1> >(0,inum),*this,ev);
+      Kokkos::parallel_reduce(
+             Kokkos::RangePolicy<DeviceType,TagPairEAMKernelAB<1>>(0,inum),
+             *this,ev);
     else
-      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelAB<0> >(0,inum),*this);
+      Kokkos::parallel_for(
+            policyInstance<TagPairEAMKernelAB<0>>::get(inum),
+            *this);
   }
 
   if (eflag) {
@@ -203,41 +208,65 @@ void PairEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   if (evflag) {
     if (neighflag == HALF) {
       if (newton_pair) {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelC<HALF,1,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMKernelC<HALF,1,1>>(0,inum),
+              *this,ev);
       } else {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelC<HALF,0,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMKernelC<HALF,0,1>>(0,inum),
+              *this,ev);
       }
     } else if (neighflag == HALFTHREAD) {
       if (newton_pair) {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelC<HALFTHREAD,1,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMKernelC<HALFTHREAD,1,1>>(0,inum),
+              *this,ev);
       } else {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelC<HALFTHREAD,0,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMKernelC<HALFTHREAD,0,1>>(0,inum),
+              *this,ev);
       }
     } else if (neighflag == FULL) {
       if (newton_pair) {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelC<FULL,1,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMKernelC<FULL,1,1>>(0,inum),
+              *this,ev);
       } else {
-        Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelC<FULL,0,1> >(0,inum),*this,ev);
+        Kokkos::parallel_reduce(
+              Kokkos::RangePolicy<DeviceType,TagPairEAMKernelC<FULL,0,1>>(0,inum),
+              *this,ev);
       }
     }
   } else {
     if (neighflag == HALF) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelC<HALF,1,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMKernelC<HALF,1,0>>::get(inum),
+              *this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelC<HALF,0,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMKernelC<HALF,0,0>>::get(inum),
+              *this);
       }
     } else if (neighflag == HALFTHREAD) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelC<HALFTHREAD,1,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMKernelC<HALFTHREAD,1,0>>::get(inum),
+              *this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelC<HALFTHREAD,0,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMKernelC<HALFTHREAD,0,0>>::get(inum),
+              *this);
       }
     } else if (neighflag == FULL) {
       if (newton_pair) {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelC<FULL,1,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMKernelC<FULL,1,0>>::get(inum),
+              *this);
       } else {
-        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairEAMKernelC<FULL,0,0> >(0,inum),*this);
+        Kokkos::parallel_for(
+              policyInstance<TagPairEAMKernelC<FULL,0,0>>::get(inum),
+              *this);
       }
     }
   }
@@ -583,6 +612,7 @@ template<class DeviceType>
 template<int EFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelB<EFLAG>, const int &ii, EV_FLOAT& ev) const {
+
   // fp = derivative of embedding energy at each atom
   // phi = embedding energy at each atom
   // if rho > rhomax (e.g. due to close approach of two atoms),
@@ -802,6 +832,235 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PA
 
 /* ---------------------------------------------------------------------- */
 
+////Specialisation for Neighborlist types Half, HalfThread, Full
+template<class DeviceType>
+template<int EFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelAB<EFLAG>,
+                                     const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member,
+                                     EV_FLOAT& ev) const {
+  int ii = team_member.league_rank()*team_member.team_size() + team_member.team_rank();
+  // rho = density at each atom
+  // loop over neighbors of my atoms
+  const int m_max = d_rhor_spline.extent_int(1);
+  const int j_max = t_ffloat_2d_n7::static_extent(2);
+  const int d_rhor_spline_cached = (m_max > MAX_CACHE_ROWS) ? 0 : 1;
+  Kokkos::View<double*[t_ffloat_2d_n7::static_extent(2)], typename DeviceType::scratch_memory_space,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>> A(team_member.team_scratch(0), MAX_CACHE_ROWS);
+
+  if (d_rhor_spline_cached) {
+    for(int i = team_member.team_rank(); i < m_max*j_max; i+= team_member.team_size()) {
+      int j = i%j_max;
+      int m = i/j_max;
+      A(m,j) = d_rhor_spline(0,m,j);
+    }
+    team_member.team_barrier();
+  }
+  if (ii < inum) {
+    const int i = d_ilist[ii];
+    const X_FLOAT xtmp = x(i,0);
+    const X_FLOAT ytmp = x(i,1);
+    const X_FLOAT ztmp = x(i,2);
+    const int itype = type(i);
+
+    const int jnum = d_numneigh[i];
+
+    F_FLOAT rhotmp = 0.0;
+
+    for (int jj = 0; jj < jnum; jj++) {
+      int j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+
+      const X_FLOAT delx = xtmp - x(j,0);
+      const X_FLOAT dely = ytmp - x(j,1);
+      const X_FLOAT delz = ztmp - x(j,2);
+      const int jtype = type(j);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq) {
+        F_FLOAT p = sqrt(rsq)*rdr + 1.0;
+        int m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+        const int d_type2rhor_ji = d_type2rhor(jtype,itype);
+        if (d_type2rhor_ji == 0 && d_rhor_spline_cached == 1) {
+          rhotmp += ((A(m,3)*p + A(m,4))*p +
+                       A(m,5))*p + A(m,6);
+        } else
+          rhotmp += ((d_rhor_spline(d_type2rhor_ji,m,3)*p + d_rhor_spline(d_type2rhor_ji,m,4))*p +
+                      d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6);
+      }
+
+    }
+    d_rho[i] += rhotmp;
+
+    // fp = derivative of embedding energy at each atom
+    // phi = embedding energy at each atom
+    // if rho > rhomax (e.g. due to close approach of two atoms),
+    //   will exceed table, so add linear term to conserve energy
+
+    F_FLOAT p = d_rho[i]*rdrho + 1.0;
+    int m = static_cast<int> (p);
+    m = MAX(1,MIN(m,nrho-1));
+    p -= m;
+    p = MIN(p,1.0);
+    const int d_type2frho_i = d_type2frho[itype];
+    d_fp[i] = (d_frho_spline(d_type2frho_i,m,0)*p + d_frho_spline(d_type2frho_i,m,1))*p + d_frho_spline(d_type2frho_i,m,2);
+    if (EFLAG) {
+      F_FLOAT phi = ((d_frho_spline(d_type2frho_i,m,3)*p + d_frho_spline(d_type2frho_i,m,4))*p +
+                      d_frho_spline(d_type2frho_i,m,5))*p + d_frho_spline(d_type2frho_i,m,6);
+      if (d_rho[i] > rhomax) phi += d_fp[i] * (d_rho[i]-rhomax);
+      if (eflag_global) ev.evdwl += phi;
+      if (eflag_atom) d_eatom[i] += phi;
+    }
+  }
+}
+
+template<class DeviceType>
+template<int EFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelAB<EFLAG>,
+                                           const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member) const {
+  EV_FLOAT ev;
+  this->template operator()<EFLAG>(TagPairEAMKernelAB<EFLAG>(), team_member, ev);
+}
+
+/* ---------------------------------------------------------------------- */
+
+////Specialisation for Neighborlist types Half, HalfThread, Full
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>,
+                                           const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member,
+                                           EV_FLOAT& ev) const {
+
+  int ii = team_member.league_rank()*team_member.team_size() + team_member.team_rank();
+
+  // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
+
+  auto v_f = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
+  auto a_f = v_f.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+
+  const int m_max = d_z2r_spline.extent_int(1);
+  const int j_max = t_ffloat_2d_n7::static_extent(2);
+  const int d_z2r_spline_cached = (m_max > MAX_CACHE_ROWS) ? 0 : 1;
+  Kokkos::View<double*[t_ffloat_2d_n7::static_extent(2)], typename DeviceType::scratch_memory_space,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>> A(team_member.team_scratch(0), MAX_CACHE_ROWS);
+
+  if (d_z2r_spline_cached) {
+    for(int i = team_member.team_rank(); i < m_max*j_max; i+= team_member.team_size()) {
+      int j = i%j_max;
+      int m = i/j_max;
+      A(m,j) = d_z2r_spline(0,m,j);
+    }
+    team_member.team_barrier();
+  }
+  if (ii < inum) {
+    const int i = d_ilist[ii];
+    const X_FLOAT xtmp = x(i,0);
+    const X_FLOAT ytmp = x(i,1);
+    const X_FLOAT ztmp = x(i,2);
+    const int itype = type(i);
+
+    const int jnum = d_numneigh[i];
+
+    F_FLOAT fxtmp = 0.0;
+    F_FLOAT fytmp = 0.0;
+    F_FLOAT fztmp = 0.0;
+
+    for (int jj = 0; jj < jnum; jj++) {
+      int j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+      const X_FLOAT delx = xtmp - x(j,0);
+      const X_FLOAT dely = ytmp - x(j,1);
+      const X_FLOAT delz = ztmp - x(j,2);
+      const int jtype = type(j);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq) {
+        const F_FLOAT r = sqrt(rsq);
+        F_FLOAT p = r*rdr + 1.0;
+        int m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+
+        // rhoip = derivative of (density at atom j due to atom i)
+        // rhojp = derivative of (density at atom i due to atom j)
+        // phi = pair potential energy
+        // phip = phi'
+        // z2 = phi * r
+        // z2p = (phi * r)' = (phi' r) + phi
+        // psip needs both fp[i] and fp[j] terms since r_ij appears in two
+        //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
+        //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
+
+        const int d_type2rhor_ij = d_type2rhor(itype,jtype);
+        const F_FLOAT rhoip = (d_rhor_spline(d_type2rhor_ij,m,0)*p + d_rhor_spline(d_type2rhor_ij,m,1))*p +
+                             d_rhor_spline(d_type2rhor_ij,m,2);
+        const int d_type2rhor_ji = d_type2rhor(jtype,itype);
+        const F_FLOAT rhojp = (d_rhor_spline(d_type2rhor_ji,m,0)*p + d_rhor_spline(d_type2rhor_ji,m,1))*p +
+                               d_rhor_spline(d_type2rhor_ji,m,2);
+        const int d_type2z2r_ij = d_type2z2r(itype,jtype);
+
+        const auto have_cache = (d_z2r_spline_cached == 1) && (0 == d_type2z2r_ij);
+        const auto z2r_spline_3 = (have_cache) ? A(m,3) : d_z2r_spline(d_type2z2r_ij,m,3);
+        const auto z2r_spline_4 = (have_cache) ? A(m,4) : d_z2r_spline(d_type2z2r_ij,m,4);
+        const auto z2r_spline_5 = (have_cache) ? A(m,5) : d_z2r_spline(d_type2z2r_ij,m,5);
+        const auto z2r_spline_6 = (have_cache) ? A(m,6) : d_z2r_spline(d_type2z2r_ij,m,6);
+
+        const F_FLOAT z2p = (3.0*rdr*z2r_spline_3*p + 2.0*rdr*z2r_spline_4)*p +
+                             rdr*z2r_spline_5; // the rdr and the factors of 3.0 and 2.0 come out of the interpolate function
+        const F_FLOAT z2 = ((z2r_spline_3*p + z2r_spline_4)*p +
+                             z2r_spline_5)*p + z2r_spline_6;
+
+        const F_FLOAT recip = 1.0/r;
+        const F_FLOAT phi = z2*recip;
+        const F_FLOAT phip = z2p*recip - phi*recip;
+        const F_FLOAT psip = d_fp[i]*rhojp + d_fp[j]*rhoip + phip;
+        const F_FLOAT fpair = -psip*recip;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+
+        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+          a_f(j,0) -= delx*fpair;
+          a_f(j,1) -= dely*fpair;
+          a_f(j,2) -= delz*fpair;
+        }
+
+        if (EVFLAG) {
+          if (eflag) {
+            ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*phi;
+          }
+
+          if (vflag_either || eflag_atom) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,phi,fpair,delx,dely,delz);
+        }
+
+      }
+    }
+
+    a_f(i,0) += fxtmp;
+    a_f(i,1) += fytmp;
+    a_f(i,2) += fztmp;
+  }
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>,
+                /*const int &ii*/
+                const typename Kokkos::TeamPolicy<DeviceType>::member_type& team_member) const {
+  EV_FLOAT ev;
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), team_member, ev);
+}
+
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 template<int NEIGHFLAG, int NEWTON_PAIR>
 KOKKOS_INLINE_FUNCTION
@@ -898,6 +1157,37 @@ void PairEAMKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int &
   }
 }
 
+/* ---------------------------------------------------------------------- */
+
+template<typename DeviceType>
+template<class TAG>
+struct PairEAMKokkos<DeviceType>::policyInstance {
+  KOKKOS_INLINE_FUNCTION
+  static auto get(int inum) {
+    auto policy = Kokkos::RangePolicy<DeviceType, TAG>(0,inum);
+    return policy;
+  }
+};
+
+#ifdef KOKKOS_ENABLE_HIP
+template<>
+template<class TAG>
+struct PairEAMKokkos<Kokkos::Experimental::HIP>::policyInstance {
+  KOKKOS_INLINE_FUNCTION
+  static auto get(int inum) {
+    static_assert(t_ffloat_2d_n7::static_extent(2) == 7,
+                  "Breaking assumption of spline dim for KernelAB and KernelC scratch caching");
+
+    auto policy = Kokkos::TeamPolicy<Kokkos::Experimental::HIP,TAG>((inum+1023)/1024, 1024)
+                           .set_scratch_size(0,
+                                Kokkos::PerTeam(MAX_CACHE_ROWS*7*sizeof(double)));
+    return policy;
+  }
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
 namespace LAMMPS_NS {
 template class PairEAMKokkos<LMPDeviceType>;
 #ifdef LMP_KOKKOS_GPU
diff --git a/src/KOKKOS/pair_eam_kokkos.h b/src/KOKKOS/pair_eam_kokkos.h
index 006dceb1f2..9d066d40a0 100644
--- a/src/KOKKOS/pair_eam_kokkos.h
+++ b/src/KOKKOS/pair_eam_kokkos.h
@@ -60,6 +60,7 @@ class PairEAMKokkos : public PairEAM, public KokkosBase {
   void compute(int, int) override;
   void init_style() override;
 
+
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairEAMPackForwardComm, const int&) const;
 
@@ -89,6 +90,14 @@ class PairEAMKokkos : public PairEAM, public KokkosBase {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairEAMKernelAB<EFLAG>, const int&) const;
 
+  template<int EFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairEAMKernelAB<EFLAG>, const typename Kokkos::TeamPolicy<DeviceType>::member_type&, EV_FLOAT&) const;
+
+  template<int EFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairEAMKernelAB<EFLAG>, const typename Kokkos::TeamPolicy<DeviceType>::member_type&) const;
+
   template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
@@ -97,6 +106,14 @@ class PairEAMKokkos : public PairEAM, public KokkosBase {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
 
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const typename Kokkos::TeamPolicy<DeviceType>::member_type&, EV_FLOAT&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const typename Kokkos::TeamPolicy<DeviceType>::member_type&) const;
+
   template<int NEIGHFLAG, int NEWTON_PAIR>
   KOKKOS_INLINE_FUNCTION
   void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
@@ -121,7 +138,7 @@ class PairEAMKokkos : public PairEAM, public KokkosBase {
   typename AT::t_efloat_1d d_eatom;
   typename AT::t_virial_array d_vatom;
 
-  int need_dup;
+  int need_dup,inum;
 
   using KKDeviceType = typename KKDevice<DeviceType>::value;
 
@@ -162,6 +179,9 @@ class PairEAMKokkos : public PairEAM, public KokkosBase {
   void file2array() override;
   void array2spline() override;
 
+  template<class TAG>
+  struct policyInstance;
+
   typename AT::t_neighbors_2d d_neighbors;
   typename AT::t_int_1d d_ilist;
   typename AT::t_int_1d d_numneigh;
diff --git a/src/MC/fix_gcmc.cpp b/src/MC/fix_gcmc.cpp
index 5d1e318db2..ff8c5eb9c9 100644
--- a/src/MC/fix_gcmc.cpp
+++ b/src/MC/fix_gcmc.cpp
@@ -2351,12 +2351,6 @@ double FixGCMC::energy_full()
 
   if (force->kspace) force->kspace->compute(eflag,vflag);
 
-  // unlike Verlet, not performing a reverse_comm() or forces here
-  // b/c GCMC does not care about forces
-  // don't think it will mess up energy due to any post_force() fixes
-  // but Modify::pre_reverse() is needed for INTEL
-
-  if (modify->n_pre_reverse) modify->pre_reverse(eflag,vflag);
   if (modify->n_post_force_any) modify->post_force(vflag);
 
   // NOTE: all fixes with energy_global_flag set and which
diff --git a/src/MC/fix_widom.cpp b/src/MC/fix_widom.cpp
index a98f29da5e..cc2f1bc94d 100644
--- a/src/MC/fix_widom.cpp
+++ b/src/MC/fix_widom.cpp
@@ -1050,13 +1050,7 @@ double FixWidom::energy_full()
 
   if (force->kspace) force->kspace->compute(eflag,vflag);
 
-  // unlike Verlet, not performing a reverse_comm() or forces here
-  // b/c Widom does not care about forces
-  // don't think it will mess up energy due to any post_force() fixes
-  // but Modify::pre_reverse() is needed for INTEL
-
-  if (modify->n_pre_reverse) modify->pre_reverse(eflag,vflag);
-  if (modify->n_pre_force) modify->pre_force(vflag);
+  if (modify->n_post_force_any) modify->post_force(vflag);
 
   // NOTE: all fixes with energy_global_flag set and which
   //   operate at pre_force() or post_force()
diff --git a/src/NETCDF/dump_netcdf.cpp b/src/NETCDF/dump_netcdf.cpp
index 0115c0bbe9..a4699897a3 100644
--- a/src/NETCDF/dump_netcdf.cpp
+++ b/src/NETCDF/dump_netcdf.cpp
@@ -442,11 +442,11 @@ void DumpNetCDF::openfile()
         const int nfield = *th->get_nfield();
 
         for (int i = 0; i < nfield; i++) {
-          if (fields[i].type == multitype::DOUBLE) {
+          if (fields[i].type == multitype::LAMMPS_DOUBLE) {
             NCERRX( nc_def_var(ncid, keywords[i].c_str(), type_nc_real, 1, dims, &thermovar[i]), keywords[i].c_str() );
-          } else if (fields[i].type == multitype::INT) {
+          } else if (fields[i].type == multitype::LAMMPS_INT) {
             NCERRX( nc_def_var(ncid, keywords[i].c_str(), NC_INT, 1, dims, &thermovar[i]), keywords[i].c_str() );
-          } else if (fields[i].type == multitype::BIGINT) {
+          } else if (fields[i].type == multitype::LAMMPS_INT64) {
             NCERRX( nc_def_var(ncid, keywords[i].c_str(), NC_INT64, 1, dims, &thermovar[i]), keywords[i].c_str() );
           }
         }
@@ -624,11 +624,11 @@ void DumpNetCDF::write()
     int nfield = *th->get_nfield();
     for (int i = 0; i < nfield; i++) {
       if (filewriter) {
-        if (fields[i].type == multitype::DOUBLE) {
+        if (fields[i].type == multitype::LAMMPS_DOUBLE) {
           NCERRX( nc_put_var1_double(ncid, thermovar[i], start, &fields[i].data.d), keywords[i].c_str() );
-        } else if (fields[i].type == multitype::INT) {
+        } else if (fields[i].type == multitype::LAMMPS_INT) {
           NCERRX( nc_put_var1_int(ncid, thermovar[i], start, &fields[i].data.i), keywords[i].c_str() );
-        } else if (fields[i].type == multitype::BIGINT) {
+        } else if (fields[i].type == multitype::LAMMPS_INT64) {
           NCERRX( nc_put_var1_bigint(ncid, thermovar[i], start, &fields[i].data.b), keywords[i].c_str() );
         }
       }
diff --git a/src/NETCDF/dump_netcdf_mpiio.cpp b/src/NETCDF/dump_netcdf_mpiio.cpp
index f9382dacef..2407678386 100644
--- a/src/NETCDF/dump_netcdf_mpiio.cpp
+++ b/src/NETCDF/dump_netcdf_mpiio.cpp
@@ -432,11 +432,11 @@ void DumpNetCDFMPIIO::openfile()
       const int nfield = *th->get_nfield();
 
       for (int i = 0; i < nfield; i++) {
-        if (fields[i].type == multitype::DOUBLE) {
+        if (fields[i].type == multitype::LAMMPS_DOUBLE) {
           NCERRX( ncmpi_def_var(ncid, keywords[i].c_str(), type_nc_real, 1, dims, &thermovar[i]), keywords[i].c_str() );
-        } else if (fields[i].type == multitype::INT) {
+        } else if (fields[i].type == multitype::LAMMPS_INT) {
           NCERRX( ncmpi_def_var(ncid, keywords[i].c_str(), NC_INT, 1, dims, &thermovar[i]), keywords[i].c_str() );
-        } else if (fields[i].type == multitype::BIGINT) {
+        } else if (fields[i].type == multitype::LAMMPS_INT64) {
           NCERRX( ncmpi_def_var(ncid, keywords[i].c_str(), NC_INT64, 1, dims, &thermovar[i]), keywords[i].c_str() );
         }
       }
@@ -617,11 +617,11 @@ void DumpNetCDFMPIIO::write()
     int nfield = *th->get_nfield();
     for (int i = 0; i < nfield; i++) {
       if (filewriter) {
-        if (fields[i].type == multitype::DOUBLE) {
+        if (fields[i].type == multitype::LAMMPS_DOUBLE) {
           NCERRX( ncmpi_put_var1_double(ncid, thermovar[i], start, &fields[i].data.d), keywords[i].c_str() );
-        } else if (fields[i].type == multitype::INT) {
+        } else if (fields[i].type == multitype::LAMMPS_INT) {
           NCERRX( ncmpi_put_var1_int(ncid, thermovar[i], start, &fields[i].data.i), keywords[i].c_str() );
-        } else if (fields[i].type == multitype::BIGINT) {
+        } else if (fields[i].type == multitype::LAMMPS_INT64) {
           NCERRX( ncmpi_put_var1_bigint(ncid, thermovar[i], start, &fields[i].data.b), keywords[i].c_str() );
         }
       }
diff --git a/src/REACTION/fix_bond_react.cpp b/src/REACTION/fix_bond_react.cpp
index 784c163f70..83442ecb25 100644
--- a/src/REACTION/fix_bond_react.cpp
+++ b/src/REACTION/fix_bond_react.cpp
@@ -131,6 +131,7 @@ FixBondReact::FixBondReact(LAMMPS *lmp, int narg, char **arg) :
   global_freq = 1;
   extvector = 0;
   rxnID = 0;
+  cuff = 1;
   maxnconstraints = 0;
   narrhenius = 0;
   status = PROCEED;
@@ -250,8 +251,9 @@ FixBondReact::FixBondReact(LAMMPS *lmp, int narg, char **arg) :
   memory->create(ghostly_rxn_count,nreacts,"bond/react:ghostly_rxn_count");
   memory->create(reaction_count_total,nreacts,"bond/react:reaction_count_total");
 
+  rescale_charges_anyflag = 0;
   for (int i = 0; i < nreacts; i++) {
-    fraction[i] = 1;
+    fraction[i] = 1.0;
     seed[i] = 12345;
     max_rxn[i] = INT_MAX;
     for (int j = 0; j < 3; j++)
@@ -261,7 +263,7 @@ FixBondReact::FixBondReact(LAMMPS *lmp, int narg, char **arg) :
     rescale_charges_flag[i] = 0;
     create_atoms_flag[i] = 0;
     modify_create_fragid[i] = -1;
-    overlapsq[i] = 0;
+    overlapsq[i] = 0.0;
     molecule_keyword[i] = OFF;
     nconstraints[i] = 0;
     // set default limit duration to 60 timesteps
@@ -388,8 +390,16 @@ FixBondReact::FixBondReact(LAMMPS *lmp, int narg, char **arg) :
         if (iarg+2 > narg) error->all(FLERR,"Illegal fix bond/react command: "
                                       "'rescale_charges' has too few arguments");
         if (strcmp(arg[iarg+1],"no") == 0) rescale_charges_flag[rxn] = 0; //default
-        else if (strcmp(arg[iarg+1],"yes") == 0) rescale_charges_flag[rxn] = 1;
-        else error->one(FLERR,"Bond/react: Illegal option for 'rescale_charges' keyword");
+        else if (strcmp(arg[iarg+1],"yes") == 0) {
+          if (!atom->q_flag) error->all(FLERR,"Illegal fix bond/react command: cannot use "
+                                      "'rescale_charges' without atomic charges enabled");
+          twomol = atom->molecules[reacted_mol[rxn]];
+          if (!twomol->qflag) error->all(FLERR,"Illegal fix bond/react command: cannot use "
+                                      "'rescale_charges' without Charges section in post-reaction template");
+          rescale_charges_flag[rxn] = 1; // overloaded below to also indicate number of atoms to update
+          rescale_charges_anyflag = 1;
+          cuff = 2; // index shift for extra values carried around by mega_gloves
+        } else error->one(FLERR,"Bond/react: Illegal option for 'rescale_charges' keyword");
         iarg += 2;
       } else if (strcmp(arg[iarg],"molecule") == 0) {
         if (iarg+2 > narg) error->all(FLERR,"Illegal fix bond/react command: "
@@ -442,8 +452,10 @@ FixBondReact::FixBondReact(LAMMPS *lmp, int narg, char **arg) :
   memory->create(delete_atoms,max_natoms,nreacts,"bond/react:delete_atoms");
   memory->create(create_atoms,max_natoms,nreacts,"bond/react:create_atoms");
   memory->create(chiral_atoms,max_natoms,6,nreacts,"bond/react:chiral_atoms");
+  memory->create(mol_total_charge,nreacts,"bond/react:mol_total_charge");
 
   for (int j = 0; j < nreacts; j++) {
+    mol_total_charge[j] = 0.0;
     for (int i = 0; i < max_natoms; i++) {
       edge[i][j] = 0;
       custom_charges[i][j] = 1; // update all partial charges by default
@@ -483,6 +495,21 @@ FixBondReact::FixBondReact(LAMMPS *lmp, int narg, char **arg) :
     if (custom_charges_fragid[i] >= 0) CustomCharges(custom_charges_fragid[i],i);
   }
 
+  // charge rescaling values must be calculated after calling CustomCharges
+  for (int myrxn = 0; myrxn < nreacts; myrxn++) {
+    if (rescale_charges_flag[myrxn]) {
+      rescale_charges_flag[myrxn] = 0; // will now store number of updated atoms
+      twomol = atom->molecules[reacted_mol[myrxn]];
+      for (int j = 0; j < twomol->natoms; j++) {
+        int jj = equivalences[j][1][myrxn]-1;
+        if (custom_charges[jj][myrxn] == 1 && delete_atoms[jj][myrxn] == 0) {
+          mol_total_charge[myrxn] += twomol->q[j];
+          rescale_charges_flag[myrxn]++;
+        }
+      }
+    }
+  }
+
   // get the names of per-atom variables needed by 'rxn' functions of custom constraint
   customvarnames();
 
@@ -605,6 +632,7 @@ FixBondReact::~FixBondReact()
   memory->destroy(delete_atoms);
   memory->destroy(create_atoms);
   memory->destroy(chiral_atoms);
+  memory->destroy(mol_total_charge);
   if (vvec != nullptr) memory->destroy(vvec);
 
   memory->destroy(rxn_name);
@@ -1282,11 +1310,11 @@ void FixBondReact::superimpose_algorithm()
   memory->create(restore_pt,MAXGUESS,4,"bond/react:restore_pt");
   memory->create(pioneers,max_natoms,"bond/react:pioneers");
   memory->create(restore,max_natoms,MAXGUESS*4,"bond/react:restore");
-  memory->create(my_mega_glove,max_natoms+1,allnattempt,"bond/react:local_mega_glove");
+  memory->create(my_mega_glove,max_natoms+cuff,allnattempt,"bond/react:my_mega_glove");
 
-  for (int i = 0; i < max_natoms+1; i++)
+  for (int i = 0; i < max_natoms+cuff; i++)
     for (int j = 0; j < allnattempt; j++)
-      my_mega_glove[i][j] = 0;
+      my_mega_glove[i][j] = 0.0;
 
   attempted_rxn = 1;
 
@@ -1336,9 +1364,10 @@ void FixBondReact::superimpose_algorithm()
             status = REJECT;
           } else {
             status = ACCEPT;
-            my_mega_glove[0][my_num_mega] = rxnID;
+            my_mega_glove[0][my_num_mega] = (double) rxnID;
+            if (rescale_charges_flag[rxnID]) my_mega_glove[1][my_num_mega] = get_totalcharge();
             for (int i = 0; i < onemol->natoms; i++) {
-              my_mega_glove[i+1][my_num_mega] = glove[i][1];
+              my_mega_glove[i+cuff][my_num_mega] = (double) glove[i][1];
             }
             my_num_mega++;
           }
@@ -1384,9 +1413,10 @@ void FixBondReact::superimpose_algorithm()
           if (fraction[rxnID] < 1.0 &&
               random[rxnID]->uniform() >= fraction[rxnID]) status = REJECT;
           else {
-            my_mega_glove[0][my_num_mega] = rxnID;
+            my_mega_glove[0][my_num_mega] = (double) rxnID;
+            if (rescale_charges_flag[rxnID]) my_mega_glove[1][my_num_mega] = get_totalcharge();
             for (int i = 0; i < onemol->natoms; i++) {
-              my_mega_glove[i+1][my_num_mega] = glove[i][1];
+              my_mega_glove[i+cuff][my_num_mega] = (double) glove[i][1];
             }
             my_num_mega++;
           }
@@ -1405,13 +1435,13 @@ void FixBondReact::superimpose_algorithm()
 
   global_megasize = 0;
 
-  memory->create(local_mega_glove,max_natoms+1,my_num_mega,"bond/react:local_mega_glove");
-  memory->create(ghostly_mega_glove,max_natoms+1,my_num_mega,"bond/react:ghostly_mega_glove");
+  memory->create(local_mega_glove,max_natoms+cuff,my_num_mega,"bond/react:local_mega_glove");
+  memory->create(ghostly_mega_glove,max_natoms+cuff,my_num_mega,"bond/react:ghostly_mega_glove");
 
-  for (int i = 0; i < max_natoms+1; i++) {
+  for (int i = 0; i < max_natoms+cuff; i++) {
     for (int j = 0; j < my_num_mega; j++) {
-      local_mega_glove[i][j] = 0;
-      ghostly_mega_glove[i][j] = 0;
+      local_mega_glove[i][j] = 0.0;
+      ghostly_mega_glove[i][j] = 0.0;
     }
   }
 
@@ -2186,6 +2216,24 @@ double FixBondReact::get_temperature(tagint **myglove, int row_offset, int col)
   return t;
 }
 
+/* ----------------------------------------------------------------------
+compute sum of partial charges in rxn site, for updated atoms
+note: currently uses global rxnID and onemol variables
+------------------------------------------------------------------------- */
+
+double FixBondReact::get_totalcharge()
+{
+  int j,jj,ilocal;
+  double *q = atom->q;
+  double sim_total_charge = 0.0;
+  for (j = 0; j < onemol->natoms; j++) {
+    jj = equivalences[j][1][rxnID]-1;
+    if (custom_charges[jj][rxnID] == 1)
+      sim_total_charge += q[atom->map(glove[jj][1])];
+  }
+  return sim_total_charge;
+}
+
 /* ----------------------------------------------------------------------
 get per-atom variable names used by custom constraint
 ------------------------------------------------------------------------- */
@@ -2673,18 +2721,18 @@ void FixBondReact::dedup_mega_gloves(int dedup_mode)
     dedup_size = global_megasize;
   }
 
-  tagint **dedup_glove;
-  memory->create(dedup_glove,max_natoms+1,dedup_size,"bond/react:dedup_glove");
+  double **dedup_glove;
+  memory->create(dedup_glove,max_natoms+cuff,dedup_size,"bond/react:dedup_glove");
 
   if (dedup_mode == LOCAL) {
     for (int i = 0; i < dedup_size; i++) {
-      for (int j = 0; j < max_natoms+1; j++) {
+      for (int j = 0; j < max_natoms+cuff; j++) {
         dedup_glove[j][i] = my_mega_glove[j][i];
       }
     }
   } else if (dedup_mode == GLOBAL) {
     for (int i = 0; i < dedup_size; i++) {
-      for (int j = 0; j < max_natoms+1; j++) {
+      for (int j = 0; j < max_natoms+cuff; j++) {
         dedup_glove[j][i] = global_mega_glove[j][i];
       }
     }
@@ -2700,16 +2748,16 @@ void FixBondReact::dedup_mega_gloves(int dedup_mode)
   // let's randomly mix up our reaction instances first
   // then we can feel okay about ignoring ones we've already deleted (or accepted)
   // based off std::shuffle
-  int *temp_rxn = new int[max_natoms+1];
+  double *temp_rxn = new double[max_natoms+cuff];
   for (int i = dedup_size-1; i > 0; --i) { //dedup_size
     // choose random entry to swap current one with
     int k = floor(random[0]->uniform()*(i+1));
 
     // swap entries
-    for (int j = 0; j < max_natoms+1; j++)
+    for (int j = 0; j < max_natoms+cuff; j++)
       temp_rxn[j] = dedup_glove[j][i];
 
-    for (int j = 0; j < max_natoms+1; j++) {
+    for (int j = 0; j < max_natoms+cuff; j++) {
       dedup_glove[j][i] = dedup_glove[j][k];
       dedup_glove[j][k] = temp_rxn[j];
     }
@@ -2721,13 +2769,13 @@ void FixBondReact::dedup_mega_gloves(int dedup_mode)
       int myrxnid1 = dedup_glove[0][i];
       onemol = atom->molecules[unreacted_mol[myrxnid1]];
       for (int j = 0; j < onemol->natoms; j++) {
-        int check1 = dedup_glove[j+1][i];
+        int check1 = dedup_glove[j+cuff][i];
         for (int ii = i + 1; ii < dedup_size; ii++) {
           if (dedup_mask[ii] == 0) {
             int myrxnid2 = dedup_glove[0][ii];
             twomol = atom->molecules[unreacted_mol[myrxnid2]];
             for (int jj = 0; jj < twomol->natoms; jj++) {
-              int check2 = dedup_glove[jj+1][ii];
+              int check2 = dedup_glove[jj+cuff][ii];
               if (check2 == check1) {
                 dedup_mask[ii] = 1;
                 break;
@@ -2745,7 +2793,7 @@ void FixBondReact::dedup_mega_gloves(int dedup_mode)
     int my_new_megasize = 0;
     for (int i = 0; i < my_num_mega; i++) {
       if (dedup_mask[i] == 0) {
-        for (int j = 0; j < max_natoms+1; j++) {
+        for (int j = 0; j < max_natoms+cuff; j++) {
           my_mega_glove[j][my_new_megasize] = dedup_glove[j][i];
         }
         my_new_megasize++;
@@ -2760,8 +2808,8 @@ void FixBondReact::dedup_mega_gloves(int dedup_mode)
     int new_global_megasize = 0;
     for (int i = 0; i < global_megasize; i++) {
       if (dedup_mask[i] == 0) {
-        ghostly_rxn_count[dedup_glove[0][i]]++;
-        for (int j = 0; j < max_natoms + 1; j++) {
+        ghostly_rxn_count[(int) dedup_glove[0][i]]++;
+        for (int j = 0; j < max_natoms + cuff; j++) {
           global_mega_glove[j][new_global_megasize] = dedup_glove[j][i];
         }
         new_global_megasize++;
@@ -2830,7 +2878,7 @@ void FixBondReact::glove_ghostcheck()
     local_rxn_count[i] = 0;
 
   for (int i = 0; i < my_num_mega; i++) {
-    rxnID = my_mega_glove[0][i];
+    rxnID = (int) my_mega_glove[0][i];
     onemol = atom->molecules[unreacted_mol[rxnID]];
     int ghostly = 0;
   #if !defined(MPI_STUBS)
@@ -2839,7 +2887,7 @@ void FixBondReact::glove_ghostcheck()
         ghostly = 1;
       } else {
         for (int j = 0; j < onemol->natoms; j++) {
-          int ilocal = atom->map(my_mega_glove[j+1][i]);
+          int ilocal = atom->map((tagint) my_mega_glove[j+cuff][i]);
           if (ilocal >= atom->nlocal || localsendlist[ilocal] == 1) {
             ghostly = 1;
             break;
@@ -2852,15 +2900,13 @@ void FixBondReact::glove_ghostcheck()
   #endif
 
     if (ghostly == 1) {
-      ghostly_mega_glove[0][ghostly_num_mega] = rxnID;
-      for (int j = 0; j < onemol->natoms+1; j++) {
+      for (int j = 0; j < onemol->natoms+cuff; j++) {
         ghostly_mega_glove[j][ghostly_num_mega] = my_mega_glove[j][i];
       }
       ghostly_num_mega++;
     } else {
-      local_mega_glove[0][local_num_mega] = rxnID;
       local_rxn_count[rxnID]++;
-      for (int j = 0; j < onemol->natoms+1; j++) {
+      for (int j = 0; j < onemol->natoms+cuff; j++) {
         local_mega_glove[j][local_num_mega] = my_mega_glove[j][i];
       }
       local_num_mega++;
@@ -2899,23 +2945,23 @@ void FixBondReact::ghost_glovecast()
   }
   MPI_Allgather(&start, 1, MPI_INT, allstarts, 1, MPI_INT, world);
   MPI_Datatype columnunsized, column;
-  int sizes[2]    = {max_natoms+1, global_megasize};
-  int subsizes[2] = {max_natoms+1, 1};
+  int sizes[2]    = {max_natoms+cuff, global_megasize};
+  int subsizes[2] = {max_natoms+cuff, 1};
   int starts[2]   = {0,0};
   MPI_Type_create_subarray (2, sizes, subsizes, starts, MPI_ORDER_C,
-                            MPI_LMP_TAGINT, &columnunsized);
-  MPI_Type_create_resized (columnunsized, 0, sizeof(tagint), &column);
+                            MPI_DOUBLE, &columnunsized);
+  MPI_Type_create_resized (columnunsized, 0, sizeof(double), &column);
   MPI_Type_commit(&column);
 
   memory->destroy(global_mega_glove);
-  memory->create(global_mega_glove,max_natoms+1,global_megasize,"bond/react:global_mega_glove");
+  memory->create(global_mega_glove,max_natoms+cuff,global_megasize,"bond/react:global_mega_glove");
 
-  for (int i = 0; i < max_natoms+1; i++)
+  for (int i = 0; i < max_natoms+cuff; i++)
     for (int j = 0; j < global_megasize; j++)
       global_mega_glove[i][j] = 0;
 
   if (ghostly_num_mega > 0) {
-    for (int i = 0; i < max_natoms+1; i++) {
+    for (int i = 0; i < max_natoms+cuff; i++) {
       for (int j = 0; j < ghostly_num_mega; j++) {
         global_mega_glove[i][j+start] = ghostly_mega_glove[i][j];
       }
@@ -3006,7 +3052,12 @@ void FixBondReact::update_everything()
 
   int update_num_mega;
   tagint **update_mega_glove;
-  memory->create(update_mega_glove,max_natoms+1,MAX(local_num_mega,global_megasize),"bond/react:update_mega_glove");
+  // for now, keeping rxnID in update_mega_glove, but not rest of cuff in update_mega_glove
+  int maxmega = MAX(local_num_mega,global_megasize);
+  memory->create(update_mega_glove,max_natoms+1,maxmega,"bond/react:update_mega_glove");
+
+  double *sim_total_charges;
+  if (rescale_charges_anyflag) memory->create(sim_total_charges,maxmega,"bond/react:sim_total_charges");
 
   for (int pass = 0; pass < 2; pass++) {
     update_num_mega = 0;
@@ -3014,15 +3065,20 @@ void FixBondReact::update_everything()
     for (int i = 0; i < nreacts; i++) iskip[i] = 0;
     if (pass == 0) {
       for (int i = 0; i < local_num_mega; i++) {
-        rxnID = local_mega_glove[0][i];
+        rxnID = (int) local_mega_glove[0][i];
         // reactions already shuffled from dedup procedure, so can skip first N
         if (iskip[rxnID]++ < nlocalskips[rxnID]) continue;
 
+        // this will be overwritten if reaction skipped by create_atoms below
+        update_mega_glove[0][update_num_mega] = (tagint) local_mega_glove[0][i];
+        for (int j = 0; j < max_natoms; j++)
+          update_mega_glove[j+1][update_num_mega] = (tagint) local_mega_glove[j+cuff][i];
+
         // atoms inserted here for serial MPI_STUBS build only
         if (create_atoms_flag[rxnID] == 1) {
           onemol = atom->molecules[unreacted_mol[rxnID]];
           twomol = atom->molecules[reacted_mol[rxnID]];
-          if (insert_atoms(local_mega_glove,i)) {
+          if (insert_atoms(update_mega_glove,update_num_mega)) {
             inserted_atoms_flag = 1;
           } else { // create aborted
             reaction_count_total[rxnID]--;
@@ -3030,23 +3086,27 @@ void FixBondReact::update_everything()
           }
         }
 
-        for (int j = 0; j < max_natoms+1; j++)
-          update_mega_glove[j][update_num_mega] = local_mega_glove[j][i];
+        if (rescale_charges_flag[rxnID]) sim_total_charges[update_num_mega] = local_mega_glove[1][i];
         update_num_mega++;
       }
     } else if (pass == 1) {
       for (int i = 0; i < global_megasize; i++) {
-        rxnID = global_mega_glove[0][i];
+        rxnID = (int) global_mega_glove[0][i];
         // reactions already shuffled from dedup procedure, so can skip first N
         if (iskip[rxnID]++ < nghostlyskips[rxnID]) continue;
 
+        // this will be overwritten if reaction skipped by create_atoms below
+        update_mega_glove[0][update_num_mega] = (tagint) global_mega_glove[0][i];
+        for (int j = 0; j < max_natoms; j++)
+          update_mega_glove[j+1][update_num_mega] = (tagint) global_mega_glove[j+cuff][i];
+
         // we can insert atoms here, now that reactions are finalized
         // can't do it any earlier, due to skipped reactions (max_rxn)
         // for MPI build, reactions that create atoms are always treated as 'global'
         if (create_atoms_flag[rxnID] == 1) {
           onemol = atom->molecules[unreacted_mol[rxnID]];
           twomol = atom->molecules[reacted_mol[rxnID]];
-          if (insert_atoms(global_mega_glove,i)) {
+          if (insert_atoms(update_mega_glove,update_num_mega)) {
             inserted_atoms_flag = 1;
           } else { // create aborted
             reaction_count_total[rxnID]--;
@@ -3054,8 +3114,7 @@ void FixBondReact::update_everything()
           }
         }
 
-        for (int j = 0; j < max_natoms+1; j++)
-          update_mega_glove[j][update_num_mega] = global_mega_glove[j][i];
+        if (rescale_charges_flag[rxnID]) sim_total_charges[update_num_mega] = global_mega_glove[1][i];
         update_num_mega++;
       }
     }
@@ -3090,38 +3149,18 @@ void FixBondReact::update_everything()
       }
     }
 
-    // get charge rescale delta
-    double charge_rescale_addend = 0;
-    if (rescale_charges_flag[rxnID] == 1) {
-      double sim_total_charge = 0;
-      double mol_total_charge = 0;
-      int n_custom_charge = 0;
-      for (int i = 0; i < update_num_mega; i++) {
-        rxnID = update_mega_glove[0][i];
-        twomol = atom->molecules[reacted_mol[rxnID]];
-        for (int j = 0; j < twomol->natoms; j++) {
-          int jj = equivalences[j][1][rxnID]-1;
-          if (atom->map(update_mega_glove[jj+1][i]) >= 0 &&
-              atom->map(update_mega_glove[jj+1][i]) < nlocal) {
-            if (landlocked_atoms[j][rxnID] == 1)
-              type[atom->map(update_mega_glove[jj+1][i])] = twomol->type[j];
-            if (twomol->qflag && atom->q_flag && custom_charges[jj][rxnID] == 1) {
-              double *q = atom->q;
-              sim_total_charge += q[atom->map(update_mega_glove[jj+1][i])];
-              mol_total_charge += twomol->q[j];
-              n_custom_charge++;
-            }
-          }
-        }
-      }
-      charge_rescale_addend = (sim_total_charge-mol_total_charge)/n_custom_charge;
-    }
-
     // update charges and types of landlocked atoms
     // also keep track of 'stabilization' groups here
+    int n_custom_charge;
+    double charge_rescale_addend;
     for (int i = 0; i < update_num_mega; i++) {
+      charge_rescale_addend = 0;
       rxnID = update_mega_glove[0][i];
       twomol = atom->molecules[reacted_mol[rxnID]];
+      if (rescale_charges_flag[rxnID]) {
+        n_custom_charge = rescale_charges_flag[rxnID];
+        charge_rescale_addend = (sim_total_charges[i]-mol_total_charge[rxnID])/n_custom_charge;
+      }
       for (int j = 0; j < twomol->natoms; j++) {
         int jj = equivalences[j][1][rxnID]-1;
         int ilocal = atom->map(update_mega_glove[jj+1][i]);
@@ -3555,6 +3594,7 @@ void FixBondReact::update_everything()
   }
 
   memory->destroy(update_mega_glove);
+  if (rescale_charges_anyflag) memory->destroy(sim_total_charges);
 
   // delete atoms. taken from fix_evaporate. but don't think it needs to be in pre_exchange
   // loop in reverse order to avoid copying marked atoms
@@ -3643,7 +3683,7 @@ void FixBondReact::update_everything()
 insert created atoms
 ------------------------------------------------------------------------- */
 
-int FixBondReact::insert_atoms(tagint **my_mega_glove, int iupdate)
+int FixBondReact::insert_atoms(tagint **my_update_mega_glove, int iupdate)
 {
   // inserting atoms based off fix_deposit->pre_exchange
   int flag;
@@ -3692,14 +3732,15 @@ int FixBondReact::insert_atoms(tagint **my_mega_glove, int iupdate)
       n2superpose++;
   }
 
-  int ifit = atom->map(my_mega_glove[ibonding[rxnID]+1][iupdate]); // use this local ID to find fitting proc
+  int ifit = atom->map(my_update_mega_glove[ibonding[rxnID]+1][iupdate]); // use this local ID to find fitting proc
   Superpose3D<double, double **> superposer(n2superpose);
   int fitroot = 0;
   if (ifit >= 0 && ifit < atom->nlocal) {
     fitroot = comm->me;
 
     // get 'temperatere' averaged over site, used for created atoms' vels
-    t = get_temperature(my_mega_glove,1,iupdate);
+    // note: row_offset for my_update_mega_glove is unity, not 'cuff'
+    t = get_temperature(my_update_mega_glove,1,iupdate);
 
     double **xfrozen; // coordinates for the "frozen" target molecule
     double **xmobile; // coordinates for the "mobile" molecule
@@ -3713,12 +3754,12 @@ int FixBondReact::insert_atoms(tagint **my_mega_glove, int iupdate)
         if (!twomol->fragmentmask[modify_create_fragid[rxnID]][j]) continue;
       int ipre = equivalences[j][1][rxnID]-1; // equiv pre-reaction template index
       if (!create_atoms[j][rxnID] && !delete_atoms[ipre][rxnID]) {
-        if (atom->map(my_mega_glove[ipre+1][iupdate]) < 0) {
+        if (atom->map(my_update_mega_glove[ipre+1][iupdate]) < 0) {
           error->warning(FLERR," eligible atoms skipped for created-atoms fit on rank {}\n",
                          comm->me);
           continue;
         }
-        iatom = atom->map(my_mega_glove[ipre+1][iupdate]);
+        iatom = atom->map(my_update_mega_glove[ipre+1][iupdate]);
         if (iref == -1) iref = iatom;
         iatom = domain->closest_image(iref,iatom);
         for (int k = 0; k < 3; k++) {
@@ -3838,7 +3879,7 @@ int FixBondReact::insert_atoms(tagint **my_mega_glove, int iupdate)
         atom->tag[n] = maxtag_all + add_count;
 
         // locally update mega_glove
-        my_mega_glove[preID][iupdate] = atom->tag[n];
+        my_update_mega_glove[preID][iupdate] = atom->tag[n];
 
         if (atom->molecule_flag) {
           if (twomol->moleculeflag) {
@@ -3866,7 +3907,7 @@ int FixBondReact::insert_atoms(tagint **my_mega_glove, int iupdate)
       }
       // globally update mega_glove and equivalences
       MPI_Allreduce(MPI_IN_PLACE,&root,1,MPI_INT,MPI_SUM,world);
-      MPI_Bcast(&my_mega_glove[preID][iupdate],1,MPI_LMP_TAGINT,root,world);
+      MPI_Bcast(&my_update_mega_glove[preID][iupdate],1,MPI_LMP_TAGINT,root,world);
       equivalences[m][0][rxnID] = m+1;
       equivalences[m][1][rxnID] = preID;
       reverse_equiv[preID-1][0][rxnID] = preID;
diff --git a/src/REACTION/fix_bond_react.h b/src/REACTION/fix_bond_react.h
index 66a5e4d6a0..534261e11d 100644
--- a/src/REACTION/fix_bond_react.h
+++ b/src/REACTION/fix_bond_react.h
@@ -71,7 +71,9 @@ class FixBondReact : public Fix {
   int **store_rxn_count;
   int *stabilize_steps_flag;
   int *custom_charges_fragid;
-  int *rescale_charges_flag;
+  int *rescale_charges_flag;   // if nonzero, indicates number of atoms whose charges are updated
+  int rescale_charges_anyflag; // indicates if any reactions do charge rescaling
+  double *mol_total_charge;    // sum of charges of post-reaction atoms whose charges are updated
   int *create_atoms_flag;
   int *modify_create_fragid;
   double *overlapsq;
@@ -156,10 +158,12 @@ class FixBondReact : public Fix {
   int lcl_inst;              // reaction instance
   tagint **glove;            // 1st colmn: pre-reacted template, 2nd colmn: global IDs
   // for all mega_gloves: first row is the ID of bond/react
-  tagint **my_mega_glove;         // local + ghostly reaction instances
-  tagint **local_mega_glove;      // consolidation of local reaction instances
-  tagint **ghostly_mega_glove;    // consolidation of nonlocal reaction instances
-  tagint **global_mega_glove;     // consolidation (inter-processor) of gloves
+  // 'cuff' leaves room for additional values carried around
+  int cuff;                       // default = 1, w/ rescale_charges_flag = 2
+  double **my_mega_glove;         // local + ghostly reaction instances
+  double **local_mega_glove;      // consolidation of local reaction instances
+  double **ghostly_mega_glove;    // consolidation of nonlocal reaction instances
+  double **global_mega_glove;     // consolidation (inter-processor) of gloves
                                   // containing nonlocal atoms
 
   int *localsendlist;      // indicates ghosts of other procs
@@ -191,6 +195,7 @@ class FixBondReact : public Fix {
   int check_constraints();
   void get_IDcoords(int, int, double *);
   double get_temperature(tagint **, int, int);
+  double get_totalcharge();
   void customvarnames();    // get per-atom variables names used by custom constraint
   void get_customvars();    // evaluate local values for variables names used by custom constraint
   double custom_constraint(const std::string &);    // evaulate expression for custom constraint
diff --git a/src/library.cpp b/src/library.cpp
index 0fcedfc20b..1996ebbcb4 100644
--- a/src/library.cpp
+++ b/src/library.cpp
@@ -794,7 +794,7 @@ argument string.
      - yes
    * - type
      - data type of thermo output column; see :cpp:enum:`_LMP_DATATYPE_CONST`
-     - pointer to static int
+     - pointer to int
      - yes
    * - data
      - actual field data for column
@@ -815,7 +815,6 @@ void *lammps_last_thermo(void *handle, const char *what, int index)
   Thermo *th = lmp->output->thermo;
   if (!th) return nullptr;
   const int nfield = *th->get_nfield();
-  static int datatype;
 
   BEGIN_CAPTURE
   {
@@ -833,25 +832,15 @@ void *lammps_last_thermo(void *handle, const char *what, int index)
     } else if (strcmp(what, "type") == 0) {
       if ((index < 0) || (index >= nfield)) return nullptr;
       const auto &field = th->get_fields()[index];
-      if (field.type == multitype::INT) {
-        datatype = LAMMPS_INT;
-        val = (void *) &datatype;
-      } else if (field.type == multitype::BIGINT) {
-        datatype = LAMMPS_INT64;
-        val = (void *) &datatype;
-      } else if (field.type == multitype::DOUBLE) {
-        datatype = LAMMPS_DOUBLE;
-        val = (void *) &datatype;
-      }
-
+      val = (void *) &field.type;
     } else if (strcmp(what, "data") == 0) {
       if ((index < 0) || (index >= nfield)) return nullptr;
       const auto &field = th->get_fields()[index];
-      if (field.type == multitype::INT) {
+      if (field.type == multitype::LAMMPS_INT) {
         val = (void *) &field.data.i;
-      } else if (field.type == multitype::BIGINT) {
+      } else if (field.type == multitype::LAMMPS_INT64) {
         val = (void *) &field.data.b;
-      } else if (field.type == multitype::DOUBLE) {
+      } else if (field.type == multitype::LAMMPS_DOUBLE) {
         val = (void *) &field.data.d;
       }
 
@@ -1351,6 +1340,13 @@ int lammps_extract_global_datatype(void * /*handle*/, const char *name)
   if (strcmp(name,"q_flag") == 0) return LAMMPS_INT;
 
   if (strcmp(name,"units") == 0) return LAMMPS_STRING;
+  if (strcmp(name,"atom_style") == 0) return LAMMPS_STRING;
+  if (strcmp(name,"pair_style") == 0) return LAMMPS_STRING;
+  if (strcmp(name,"bond_style") == 0) return LAMMPS_STRING;
+  if (strcmp(name,"angle_style") == 0) return LAMMPS_STRING;
+  if (strcmp(name,"dihedral_style") == 0) return LAMMPS_STRING;
+  if (strcmp(name,"improper_style") == 0) return LAMMPS_STRING;
+  if (strcmp(name,"kspace_style") == 0) return LAMMPS_STRING;
   if (strcmp(name,"boltz") == 0) return LAMMPS_DOUBLE;
   if (strcmp(name,"hplanck") == 0) return LAMMPS_DOUBLE;
   if (strcmp(name,"mvv2e") == 0) return LAMMPS_DOUBLE;
@@ -1597,6 +1593,34 @@ report the "native" data type.  The following tables are provided:
      - int
      - 1
      - **deprecated**. Use :cpp:func:`lammps_extract_setting` instead.
+   * - atom_style
+     - char \*
+     - 1
+     - string with the current atom style.
+   * - pair_style
+     - char \*
+     - 1
+     - string with the current pair style.
+   * - bond_style
+     - char \*
+     - 1
+     - string with the current bond style.
+   * - angle_style
+     - char \*
+     - 1
+     - string with the current angle style.
+   * - dihedral_style
+     - char \*
+     - 1
+     - string with the current dihedral style.
+   * - improper_style
+     - char \*
+     - 1
+     - string with the current improper style.
+   * - kspace_style
+     - char \*
+     - 1
+     - string with the current KSpace style.
 
 .. _extract_unit_settings:
 
@@ -1703,6 +1727,13 @@ void *lammps_extract_global(void *handle, const char *name)
   auto lmp = (LAMMPS *) handle;
 
   if (strcmp(name,"units") == 0) return (void *) lmp->update->unit_style;
+  if (strcmp(name,"atom_style") == 0) return (void *) lmp->atom->atom_style;
+  if (strcmp(name,"pair_style") == 0) return (void *) lmp->force->pair_style;
+  if (strcmp(name,"bond_style") == 0) return (void *) lmp->force->bond_style;
+  if (strcmp(name,"angle_style") == 0) return (void *) lmp->force->angle_style;
+  if (strcmp(name,"dihedral_style") == 0) return (void *) lmp->force->dihedral_style;
+  if (strcmp(name,"improper_style") == 0) return (void *) lmp->force->improper_style;
+  if (strcmp(name,"kspace_style") == 0) return (void *) lmp->force->kspace_style;
   if (strcmp(name,"dt") == 0) return (void *) &lmp->update->dt;
   if (strcmp(name,"ntimestep") == 0) return (void *) &lmp->update->ntimestep;
   // update->atime can be referenced as a pointer
diff --git a/src/library.h b/src/library.h
index 5f46e36ccf..30a12ebdef 100644
--- a/src/library.h
+++ b/src/library.h
@@ -41,10 +41,11 @@
 /** Data type constants for extracting data from atoms, computes and fixes
  *
  * Must be kept in sync with the equivalent constants in ``python/lammps/constants.py``,
- * ``fortran/lammps.f90``, ``tools/swig/lammps.i``, and
+ * ``fortran/lammps.f90``, ``tools/swig/lammps.i``, ``src/lmptype.h``, and
  *``examples/COUPLE/plugin/liblammpsplugin.h`` */
 
 enum _LMP_DATATYPE_CONST {
+  LAMMPS_NONE = -1,     /*!< no data type assigned (yet) */
   LAMMPS_INT = 0,       /*!< 32-bit integer (array) */
   LAMMPS_INT_2D = 1,    /*!< two-dimensional 32-bit integer array */
   LAMMPS_DOUBLE = 2,    /*!< 64-bit double (array) */
diff --git a/src/lmptype.h b/src/lmptype.h
index 6e0b54d988..2089350f48 100644
--- a/src/lmptype.h
+++ b/src/lmptype.h
@@ -276,7 +276,21 @@ union ubuf {
 \endverbatim
   */
 struct multitype {
-  enum { NONE, DOUBLE, INT, BIGINT };
+  /** Data type constants for extracting data from atoms, computes and fixes
+   *
+   * This enum must be kept in sync with the corresponding enum or constants
+   * in ``python/lammps/constants.py``, ``fortran/lammps.f90``, ``tools/swig/lammps.i``,
+   * ``src/library.h``, and ``examples/COUPLE/plugin/liblammpsplugin.h`` */
+  enum _LMP_DATATYPE_CONST {
+    LAMMPS_NONE = -1,     /*!< no data type assigned (yet) */
+    LAMMPS_INT = 0,       /*!< 32-bit integer (array) */
+    LAMMPS_INT_2D = 1,    /*!< two-dimensional 32-bit integer array */
+    LAMMPS_DOUBLE = 2,    /*!< 64-bit double (array) */
+    LAMMPS_DOUBLE_2D = 3, /*!< two-dimensional 64-bit double array */
+    LAMMPS_INT64 = 4,     /*!< 64-bit integer (array) */
+    LAMMPS_INT64_2D = 5,  /*!< two-dimensional 64-bit integer array */
+    LAMMPS_STRING = 6     /*!< C-String */
+  };
 
   int type;
   union {
@@ -285,26 +299,26 @@ struct multitype {
     int64_t b;
   } data;
 
-  multitype() : type(NONE) { data.d = 0.0; }
+  multitype() : type(LAMMPS_NONE) { data.d = 0.0; }
   multitype(const multitype &) = default;
   multitype(multitype &&) = default;
   ~multitype() = default;
 
   multitype &operator=(const double &_d)
   {
-    type = DOUBLE;
+    type = LAMMPS_DOUBLE;
     data.d = _d;
     return *this;
   }
   multitype &operator=(const int &_i)
   {
-    type = INT;
+    type = LAMMPS_INT;
     data.i = _i;
     return *this;
   }
   multitype &operator=(const int64_t &_b)
   {
-    type = BIGINT;
+    type = LAMMPS_INT64;
     data.b = _b;
     return *this;
   }
diff --git a/tools/swig/lammps.i b/tools/swig/lammps.i
index c4ef0a7109..91a6866107 100644
--- a/tools/swig/lammps.i
+++ b/tools/swig/lammps.i
@@ -28,9 +28,10 @@
  *
  * Must be kept in sync with the equivalent constants in ``src/library.h``,
  * ``python/lammps/constants.py``, ``examples/COUPLE/plugin/liblammpsplugin.h``,
- * and ``fortran/lammps.f90`` */
+ * ``src/lmptype.h``, and ``fortran/lammps.f90`` */
 
 enum _LMP_DATATYPE_CONST {
+  LAMMPS_NONE      =-1,     /*!< no data type assigned (yet) */
   LAMMPS_INT       = 0,     /*!< 32-bit integer (array) */
   LAMMPS_INT_2D    = 1,     /*!< two-dimensional 32-bit integer array */
   LAMMPS_DOUBLE    = 2,     /*!< 64-bit double (array) */
diff --git a/unittest/fortran/test_fortran_get_thermo.f90 b/unittest/fortran/test_fortran_get_thermo.f90
index 7911ab07d5..965e6d58f7 100644
--- a/unittest/fortran/test_fortran_get_thermo.f90
+++ b/unittest/fortran/test_fortran_get_thermo.f90
@@ -152,3 +152,111 @@ FUNCTION f_lammps_get_thermo_zhi() BIND(C)
 
    f_lammps_get_thermo_zhi = lmp%get_thermo('zhi')
 END FUNCTION f_lammps_get_thermo_zhi
+
+SUBROUTINE f_lammps_last_thermo_setup() BIND(C)
+  USE liblammps
+  USE keepstuff, ONLY : lmp, big_input, cont_input, pair_input
+  IMPLICIT NONE
+
+  CALL lmp%commands_list(big_input)
+  CALL lmp%commands_list(cont_input)
+  CALL lmp%commands_list(pair_input)
+  CALL lmp%command('thermo 10')
+  CALL lmp%command('run 15 post no')
+END SUBROUTINE f_lammps_last_thermo_setup
+
+FUNCTION f_lammps_last_thermo_step() BIND(C)
+  USE, INTRINSIC :: ISO_C_BINDING, ONLY: c_int, c_int64_t
+  USE liblammps
+  USE keepstuff, ONLY : lmp
+  IMPLICIT NONE
+  INTEGER(c_int) :: f_lammps_last_thermo_step
+  INTEGER :: size_bigint
+  INTEGER(c_int), POINTER :: ival
+  INTEGER(c_int64_t), POINTER :: bval
+
+  size_bigint = lmp%extract_setting('bigint')
+  IF (size_bigint == 4) THEN
+      ival = lmp%last_thermo('step',1)
+      f_lammps_last_thermo_step = INT(ival)
+  ELSE
+      bval = lmp%last_thermo('step',1)
+      f_lammps_last_thermo_step = INT(bval)
+  END IF
+END FUNCTION f_lammps_last_thermo_step
+
+FUNCTION f_lammps_last_thermo_num() BIND(C)
+  USE, INTRINSIC :: ISO_C_BINDING, ONLY: c_int
+  USE liblammps
+  USE keepstuff, ONLY : lmp
+  IMPLICIT NONE
+  INTEGER(c_int) :: f_lammps_last_thermo_num
+  INTEGER(c_int), POINTER :: ival
+
+  ival = lmp%last_thermo('num',1)
+  f_lammps_last_thermo_num = ival
+END FUNCTION f_lammps_last_thermo_num
+
+FUNCTION f_lammps_last_thermo_type(idx) BIND(C)
+  USE, INTRINSIC :: ISO_C_BINDING, ONLY: c_int
+  USE liblammps
+  USE keepstuff, ONLY : lmp
+  IMPLICIT NONE
+  INTEGER(c_int), VALUE :: idx
+  INTEGER(c_int) :: f_lammps_last_thermo_type
+  INTEGER(c_int), POINTER :: ival
+
+  ival = lmp%last_thermo('type',idx)
+  f_lammps_last_thermo_type = ival
+END FUNCTION f_lammps_last_thermo_type
+
+FUNCTION f_lammps_last_thermo_string(idx) BIND(C)
+  USE, INTRINSIC :: ISO_C_BINDING, ONLY: c_int, c_ptr, c_null_ptr
+  USE liblammps
+  USE keepstuff, ONLY : lmp, f2c_string
+  IMPLICIT NONE
+  INTEGER(c_int), VALUE :: idx
+  TYPE(c_ptr) :: f_lammps_last_thermo_string
+  CHARACTER(LEN=12) :: buffer
+
+  buffer = lmp%last_thermo('keyword',idx)
+  IF (LEN_TRIM(buffer) > 0) THEN
+      f_lammps_last_thermo_string = f2c_string(buffer)
+  ELSE
+      f_lammps_last_thermo_string = c_null_ptr
+  END IF
+END FUNCTION f_lammps_last_thermo_string
+
+FUNCTION f_lammps_last_thermo_int(idx) BIND(C)
+  USE, INTRINSIC :: ISO_C_BINDING, ONLY: c_int, c_int64_t
+  USE liblammps
+  USE keepstuff, ONLY : lmp
+  IMPLICIT NONE
+  INTEGER(c_int), VALUE :: idx
+  INTEGER(c_int), POINTER :: ival
+  INTEGER(c_int64_t), POINTER :: bval
+  INTEGER(c_int) :: f_lammps_last_thermo_int
+  INTEGER :: size_bigint
+
+  size_bigint = lmp%extract_setting('bigint')
+  IF (size_bigint == 4) THEN
+      ival = lmp%last_thermo('data',idx)
+      f_lammps_last_thermo_int = INT(ival)
+  ELSE
+      bval = lmp%last_thermo('data',idx)
+      f_lammps_last_thermo_int = INT(bval)
+  END IF
+END FUNCTION f_lammps_last_thermo_int
+
+FUNCTION f_lammps_last_thermo_double(idx) BIND(C)
+  USE, INTRINSIC :: ISO_C_BINDING, ONLY: c_int, c_double
+  USE liblammps
+  USE keepstuff, ONLY : lmp
+  IMPLICIT NONE
+  INTEGER(c_int), VALUE :: idx
+  REAL(c_double), POINTER :: dval
+  REAL(c_double) :: f_lammps_last_thermo_double
+
+  dval = lmp%last_thermo('data',idx)
+  f_lammps_last_thermo_double = dval
+END FUNCTION f_lammps_last_thermo_double
diff --git a/unittest/fortran/wrap_get_thermo.cpp b/unittest/fortran/wrap_get_thermo.cpp
index 71b51609eb..e18697ba9b 100644
--- a/unittest/fortran/wrap_get_thermo.cpp
+++ b/unittest/fortran/wrap_get_thermo.cpp
@@ -1,6 +1,7 @@
 // unit tests for getting thermodynamic output from a LAMMPS instance through the Fortran wrapper
 
 #include "lammps.h"
+#include "lmptype.h"
 #include <mpi.h>
 #include <string>
 
@@ -23,8 +24,18 @@ double f_lammps_get_thermo_ylo();
 double f_lammps_get_thermo_yhi();
 double f_lammps_get_thermo_zlo();
 double f_lammps_get_thermo_zhi();
+
+void f_lammps_last_thermo_setup();
+int f_lammps_last_thermo_step();
+int f_lammps_last_thermo_num();
+int f_lammps_last_thermo_type(int);
+const char *f_lammps_last_thermo_string(int);
+int f_lammps_last_thermo_int(int);
+double f_lammps_last_thermo_double(int);
 }
 
+using LAMMPS_NS::multitype;
+
 class LAMMPS_thermo : public ::testing::Test {
 protected:
     LAMMPS_NS::LAMMPS *lmp;
@@ -65,3 +76,33 @@ TEST_F(LAMMPS_thermo, get_thermo)
     EXPECT_DOUBLE_EQ(f_lammps_get_thermo_zlo(), 0.0);
     EXPECT_DOUBLE_EQ(f_lammps_get_thermo_zhi(), 4.0);
 };
+
+TEST_F(LAMMPS_thermo, last_thermo)
+{
+    EXPECT_EQ(f_lammps_last_thermo_step(), -1);
+    EXPECT_EQ(f_lammps_last_thermo_type(1), multitype::LAMMPS_NONE);
+    EXPECT_EQ(f_lammps_last_thermo_type(2), multitype::LAMMPS_NONE);
+    f_lammps_last_thermo_setup();
+    EXPECT_EQ(f_lammps_last_thermo_step(), 15);
+    EXPECT_EQ(f_lammps_last_thermo_num(), 6);
+    EXPECT_STREQ(f_lammps_last_thermo_string(1), "Step");
+    EXPECT_STREQ(f_lammps_last_thermo_string(2), "Temp");
+    EXPECT_STREQ(f_lammps_last_thermo_string(3), "E_pair");
+    EXPECT_STREQ(f_lammps_last_thermo_string(6), "Press");
+#if defined(LAMMPS_SMALLSMALL)
+    EXPECT_EQ(f_lammps_last_thermo_type(1), multitype::LAMMPS_INT);
+#else
+    EXPECT_EQ(f_lammps_last_thermo_type(1), multitype::LAMMPS_INT64);
+#endif
+    EXPECT_EQ(f_lammps_last_thermo_int(1), 15);
+    EXPECT_EQ(f_lammps_last_thermo_type(2), multitype::LAMMPS_DOUBLE);
+    EXPECT_EQ(f_lammps_last_thermo_type(3), multitype::LAMMPS_DOUBLE);
+    EXPECT_EQ(f_lammps_last_thermo_type(4), multitype::LAMMPS_DOUBLE);
+    EXPECT_EQ(f_lammps_last_thermo_type(5), multitype::LAMMPS_DOUBLE);
+    EXPECT_EQ(f_lammps_last_thermo_type(6), multitype::LAMMPS_DOUBLE);
+    EXPECT_DOUBLE_EQ(f_lammps_last_thermo_double(2), 0.0);
+    EXPECT_DOUBLE_EQ(f_lammps_last_thermo_double(3), -0.13713425198078993);
+    EXPECT_DOUBLE_EQ(f_lammps_last_thermo_double(4), 0.0);
+    EXPECT_DOUBLE_EQ(f_lammps_last_thermo_double(5), -0.13713425198078993);
+    EXPECT_DOUBLE_EQ(f_lammps_last_thermo_double(6), -0.022421073321023492);
+};
diff --git a/unittest/python/python-pylammps.py b/unittest/python/python-pylammps.py
index 2b92f82248..9e691b1b8c 100644
--- a/unittest/python/python-pylammps.py
+++ b/unittest/python/python-pylammps.py
@@ -82,14 +82,26 @@ class PythonPyLammps(unittest.TestCase):
         self.pylmp.variable("fx atom fx")
         self.pylmp.run(10)
 
-        self.assertEqual(len(self.pylmp.runs), 1)
-        self.assertEqual(self.pylmp.last_run, self.pylmp.runs[0])
-        self.assertEqual(len(self.pylmp.last_run.thermo.Step), 2)
-        self.assertEqual(len(self.pylmp.last_run.thermo.Temp), 2)
-        self.assertEqual(len(self.pylmp.last_run.thermo.E_pair), 2)
-        self.assertEqual(len(self.pylmp.last_run.thermo.E_mol), 2)
-        self.assertEqual(len(self.pylmp.last_run.thermo.TotEng), 2)
-        self.assertEqual(len(self.pylmp.last_run.thermo.Press), 2)
+        # thermo data is only captured during a run if PYTHON package is enabled
+        # without it, it will only capture the final thermo at completion
+        if self.pylmp.lmp.has_package("PYTHON"):
+            self.assertEqual(len(self.pylmp.runs), 1)
+            self.assertEqual(self.pylmp.last_run, self.pylmp.runs[0])
+            self.assertEqual(len(self.pylmp.last_run.thermo.Step), 2)
+            self.assertEqual(len(self.pylmp.last_run.thermo.Temp), 2)
+            self.assertEqual(len(self.pylmp.last_run.thermo.E_pair), 2)
+            self.assertEqual(len(self.pylmp.last_run.thermo.E_mol), 2)
+            self.assertEqual(len(self.pylmp.last_run.thermo.TotEng), 2)
+            self.assertEqual(len(self.pylmp.last_run.thermo.Press), 2)
+        else:
+            self.assertEqual(len(self.pylmp.runs), 1)
+            self.assertEqual(self.pylmp.last_run, self.pylmp.runs[0])
+            self.assertEqual(len(self.pylmp.last_run.thermo.Step), 1)
+            self.assertEqual(len(self.pylmp.last_run.thermo.Temp), 1)
+            self.assertEqual(len(self.pylmp.last_run.thermo.E_pair), 1)
+            self.assertEqual(len(self.pylmp.last_run.thermo.E_mol), 1)
+            self.assertEqual(len(self.pylmp.last_run.thermo.TotEng), 1)
+            self.assertEqual(len(self.pylmp.last_run.thermo.Press), 1)
 
     def test_info_queries(self):
         self.pylmp.lattice("fcc", 0.8442),
diff --git a/unittest/utils/test_lmptype.cpp b/unittest/utils/test_lmptype.cpp
index e5dc871953..383c9d4b2c 100644
--- a/unittest/utils/test_lmptype.cpp
+++ b/unittest/utils/test_lmptype.cpp
@@ -54,18 +54,18 @@ TEST(Types, multitype)
     m[4] = -1023;
     m[5] = -2.225;
 
-    EXPECT_EQ(m[0].type, multitype::BIGINT);
-    EXPECT_EQ(m[1].type, multitype::INT);
-    EXPECT_EQ(m[2].type, multitype::DOUBLE);
+    EXPECT_EQ(m[0].type, multitype::LAMMPS_INT64);
+    EXPECT_EQ(m[1].type, multitype::LAMMPS_INT);
+    EXPECT_EQ(m[2].type, multitype::LAMMPS_DOUBLE);
 
 #if defined(LAMMPS_SMALLSMALL)
-    EXPECT_EQ(m[3].type, multitype::INT);
+    EXPECT_EQ(m[3].type, multitype::LAMMPS_INT);
 #else
-    EXPECT_EQ(m[3].type, multitype::BIGINT);
+    EXPECT_EQ(m[3].type, multitype::LAMMPS_INT64);
 #endif
-    EXPECT_EQ(m[4].type, multitype::INT);
-    EXPECT_EQ(m[5].type, multitype::DOUBLE);
-    EXPECT_EQ(m[6].type, multitype::NONE);
+    EXPECT_EQ(m[4].type, multitype::LAMMPS_INT);
+    EXPECT_EQ(m[5].type, multitype::LAMMPS_DOUBLE);
+    EXPECT_EQ(m[6].type, multitype::LAMMPS_NONE);
 
     EXPECT_EQ(m[0].data.b, b1);
     EXPECT_EQ(m[1].data.i, i1);