diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake
index a0b872ba85..ce55c83b08 100644
--- a/cmake/Modules/Packages/KOKKOS.cmake
+++ b/cmake/Modules/Packages/KOKKOS.cmake
@@ -16,11 +16,6 @@ endif()
 if(Kokkos_ENABLE_OPENMP)
   if(NOT BUILD_OMP)
     message(FATAL_ERROR "Must enable BUILD_OMP with Kokkos_ENABLE_OPENMP")
-  else()
-    # NVHPC/(AMD)Clang does not seem to provide a detectable OpenMP version, but is far beyond version 3.1
-    if((OpenMP_CXX_VERSION VERSION_LESS 3.1) AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "NVHPC") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
-      message(FATAL_ERROR "Compiler must support OpenMP 3.1 or later with Kokkos_ENABLE_OPENMP")
-    endif()
   endif()
 endif()
 ########################################################################
diff --git a/doc/src/Commands_bond.rst b/doc/src/Commands_bond.rst
index aaf706b5df..ef36b6b7c4 100644
--- a/doc/src/Commands_bond.rst
+++ b/doc/src/Commands_bond.rst
@@ -124,7 +124,7 @@ OPT.
    *
    *
    * :doc:`charmm (iko) <dihedral_charmm>`
-   * :doc:`charmmfsw <dihedral_charmm>`
+   * :doc:`charmmfsw (k) <dihedral_charmm>`
    * :doc:`class2 (ko) <dihedral_class2>`
    * :doc:`cosine/shift/exp (o) <dihedral_cosine_shift_exp>`
    * :doc:`fourier (io) <dihedral_fourier>`
diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst
index e7761e7bee..9f2bdbce79 100644
--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@@ -146,7 +146,7 @@ OPT.
    * :doc:`lj/charmm/coul/long/soft (o) <pair_fep_soft>`
    * :doc:`lj/charmm/coul/msm (o) <pair_charmm>`
    * :doc:`lj/charmmfsw/coul/charmmfsh <pair_charmm>`
-   * :doc:`lj/charmmfsw/coul/long <pair_charmm>`
+   * :doc:`lj/charmmfsw/coul/long (k) <pair_charmm>`
    * :doc:`lj/class2 (gko) <pair_class2>`
    * :doc:`lj/class2/coul/cut (ko) <pair_class2>`
    * :doc:`lj/class2/coul/cut/soft <pair_fep_soft>`
diff --git a/doc/src/Developer_updating.rst b/doc/src/Developer_updating.rst
index 36c6974b30..cd61eaa5a1 100644
--- a/doc/src/Developer_updating.rst
+++ b/doc/src/Developer_updating.rst
@@ -20,6 +20,7 @@ Available topics in mostly chronological order are:
 - `Use ev_init() to initialize variables derived from eflag and vflag`_
 - `Use utils::numeric() functions instead of force->numeric()`_
 - `Use utils::open_potential() function to open potential files`_
+- `Use symbolic Atom and AtomVec constants instead of numerical values`_
 - `Simplify customized error messages`_
 - `Use of "override" instead of "virtual"`_
 - `Simplified and more compact neighbor list requests`_
@@ -196,6 +197,71 @@ New:
 
    fp = utils::open_potential(filename, lmp);
 
+Use symbolic Atom and AtomVec constants instead of numerical values
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. versionchanged:: 18Sep2020
+
+Properties in LAMMPS that were represented by integer values (0, 1,
+2, 3) to indicate settings in the ``Atom`` and ``AtomVec`` classes (and
+classes derived from them) have been converted to use scoped
+enumerators instead.
+
+.. list-table::
+   :header-rows: 1
+   :widths: auto
+
+   * - Symbolic Constant
+     - Value
+     - Symbolic Constant
+     - Value
+   * - Atom::GROW
+     - 0
+     - Atom::MAP_NONE
+     - 0
+   * - Atom::RESTART
+     - 1
+     - Atom::MAP_ARRAY
+     - 1
+   * - Atom::BORDER
+     - 2
+     - Atom::MAP_HASH
+     - 2
+   * - Atom::ATOMIC
+     - 0
+     - Atom::MAP_YES
+     - 3
+   * - Atom::MOLECULAR
+     - 1
+     - AtomVec::PER_ATOM
+     - 0
+   * - Atom::TEMPLATE
+     - 2
+     - AtomVec::PER_TYPE
+     - 1
+
+Old:
+
+.. code-block:: c++
+
+   molecular = 0;
+   mass_type = 1;
+   if (atom->molecular == 2)
+   if (atom->map_style == 2)
+   atom->add_callback(0);
+   atom->delete_callback(id,1);
+
+New:
+
+.. code-block:: c++
+
+   molecular = Atom::ATOMIC;
+   mass_type = AtomVec::PER_TYPE;
+   if (atom->molecular == Atom::TEMPLATE)
+   if (atom->map_style == Atom::MAP_HASH)
+   atom->add_callback(Atom::GROW);
+   atom->delete_callback(id,Atom::RESTART);
+
 Simplify customized error messages
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/doc/src/angle_charmm.rst b/doc/src/angle_charmm.rst
index 425ed7e4f1..655b860a28 100644
--- a/doc/src/angle_charmm.rst
+++ b/doc/src/angle_charmm.rst
@@ -70,7 +70,9 @@ for more info.
 Related commands
 """"""""""""""""
 
-:doc:`angle_coeff <angle_coeff>`
+:doc:`angle_coeff <angle_coeff>`, :doc:`pair_style lj/charmm variants <pair_charmm>`,
+:doc:`dihedral_style charmm <dihedral_charmm>`,
+:doc:`dihedral_style charmmfsw <dihedral_charmm>`, :doc:`fix cmap <fix_cmap>`
 
 Default
 """""""
diff --git a/doc/src/angle_lepton.rst b/doc/src/angle_lepton.rst
index 20fa5b1fee..22873f5765 100644
--- a/doc/src/angle_lepton.rst
+++ b/doc/src/angle_lepton.rst
@@ -11,7 +11,16 @@ Syntax
 
 .. code-block:: LAMMPS
 
-   angle_style lepton
+   angle_style style args
+
+* style = *lepton*
+* args = optional arguments
+
+.. parsed-literal::
+
+   args = *auto_offset* or *no_offset*
+     *auto_offset* = offset the potential energy so that the value at theta0 is 0.0 (default)
+     *no_offset* = do not offset the potential energy
 
 Examples
 """"""""
@@ -19,6 +28,7 @@ Examples
 .. code-block:: LAMMPS
 
    angle_style lepton
+   angle_style lepton no_offset
 
    angle_coeff  1  120.0  "k*theta^2; k=250.0"
    angle_coeff  2   90.0  "k2*theta^2 + k3*theta^3 + k4*theta^4; k2=300.0; k3=-100.0; k4=50.0"
@@ -41,6 +51,13 @@ angle coefficient.  For example `"200.0*theta^2"` represents a
 
    U_{angle,i} = K (\theta_i - \theta_0)^2 = K \theta^2 \qquad \theta = \theta_i - \theta_0
 
+.. versionchanged:: TBD
+
+By default the potential energy U is shifted so that the value U is 0.0
+for :math:`\theta_i = \theta_0`.  This is equivalent to using the optional keyword
+*auto_offset*.  When using the keyword *no_offset* instead, the
+potential energy is not shifted.
+
 The `Lepton library <https://simtk.org/projects/lepton>`_, that the
 *lepton* angle style interfaces with, evaluates this expression string
 at run time to compute the pairwise energy.  It also creates an
diff --git a/doc/src/bond_bpm_rotational.rst b/doc/src/bond_bpm_rotational.rst
index 7459d491d6..6734bd7bfe 100644
--- a/doc/src/bond_bpm_rotational.rst
+++ b/doc/src/bond_bpm_rotational.rst
@@ -147,8 +147,8 @@ By default, pair forces are not calculated between bonded particles.
 Pair forces can alternatively be overlaid on top of bond forces by setting
 the *overlay/pair* keyword to *yes*. These settings require specific
 :doc:`special_bonds <special_bonds>` settings described in the
-restrictions.  Further details can be found in the :doc:`how to
-<Howto_bpm>` page on BPMs.
+restrictions.  Further details can be found in the :doc:`how to <Howto_bpm>`
+page on BPMs.
 
 .. versionadded:: 28Mar2023
 
diff --git a/doc/src/bond_bpm_spring.rst b/doc/src/bond_bpm_spring.rst
index 04ff4d5991..a03c832249 100644
--- a/doc/src/bond_bpm_spring.rst
+++ b/doc/src/bond_bpm_spring.rst
@@ -113,8 +113,8 @@ By default, pair forces are not calculated between bonded particles.
 Pair forces can alternatively be overlaid on top of bond forces by setting
 the *overlay/pair* keyword to *yes*. These settings require specific
 :doc:`special_bonds <special_bonds>` settings described in the
-restrictions.  Further details can be found in the :doc:`how to
-<Howto_bpm>` page on BPMs.
+restrictions.  Further details can be found in the :doc:`how to <Howto_bpm>`
+page on BPMs.
 
 .. versionadded:: 28Mar2023
 
diff --git a/doc/src/bond_lepton.rst b/doc/src/bond_lepton.rst
index adfd30627d..9429535af8 100644
--- a/doc/src/bond_lepton.rst
+++ b/doc/src/bond_lepton.rst
@@ -11,7 +11,16 @@ Syntax
 
 .. code-block:: LAMMPS
 
-   bond_style lepton
+   bond_style style args
+
+* style = *lepton*
+* args = optional arguments
+
+.. parsed-literal::
+
+   args = *auto_offset* or *no_offset*
+     *auto_offset* = offset the potential energy so that the value at r0 is 0.0 (default)
+     *no_offset* = do not offset the potential energy
 
 Examples
 """"""""
@@ -19,6 +28,7 @@ Examples
 .. code-block:: LAMMPS
 
    bond_style lepton
+   bond_style lepton no_offset
 
    bond_coeff  1  1.5 "k*r^2; k=250.0"
    bond_coeff  2  1.1 "k2*r^2 + k3*r^3 + k4*r^4; k2=300.0; k3=-100.0; k4=50.0"
@@ -40,6 +50,13 @@ constant *K* of 200.0 energy units:
 
    U_{bond,i} = K (r_i - r_0)^2 = K r^2 \qquad r = r_i - r_0
 
+.. versionchanged:: TBD
+
+By default the potential energy U is shifted so that the value U is 0.0
+for :math:`r_i = r_0`.  This is equivalent to using the optional keyword
+*auto_offset*.  When using the keyword *no_offset* instead, the
+potential energy is not shifted.
+
 The `Lepton library <https://simtk.org/projects/lepton>`_, that the
 *lepton* bond style interfaces with, evaluates this expression string at
 run time to compute the pairwise energy.  It also creates an analytical
diff --git a/doc/src/dihedral_charmm.rst b/doc/src/dihedral_charmm.rst
index cc792693a2..a5652bc74e 100644
--- a/doc/src/dihedral_charmm.rst
+++ b/doc/src/dihedral_charmm.rst
@@ -3,6 +3,7 @@
 .. index:: dihedral_style charmm/kk
 .. index:: dihedral_style charmm/omp
 .. index:: dihedral_style charmmfsw
+.. index:: dihedral_style charmmfsw/kk
 
 dihedral_style charmm command
 =============================
@@ -12,6 +13,8 @@ Accelerator Variants: *charmm/intel*, *charmm/kk*, *charmm/omp*
 dihedral_style charmmfsw command
 ================================
 
+Accelerator Variants: *charmmfsw/kk*
+
 Syntax
 """"""
 
@@ -144,7 +147,9 @@ for more info.
 Related commands
 """"""""""""""""
 
-:doc:`dihedral_coeff <dihedral_coeff>`
+:doc:`dihedral_coeff <dihedral_coeff>`,
+:doc:`pair_style lj/charmm variants <pair_charmm>`,
+:doc:`angle_style charmm <angle_charmm>`, :doc:`fix cmap <fix_cmap>`
 
 Default
 """""""
diff --git a/doc/src/fix_qeq.rst b/doc/src/fix_qeq.rst
index bace7af0ca..f353e9a998 100644
--- a/doc/src/fix_qeq.rst
+++ b/doc/src/fix_qeq.rst
@@ -232,8 +232,6 @@ These fixes are part of the QEQ package.  They are only enabled if
 LAMMPS was built with that package.  See the :doc:`Build package
 <Build_package>` page for more info.
 
-These qeq fixes are not compatible with the GPU and USER-INTEL packages.
-
 These qeq fixes will ignore electric field contributions from
 :doc:`fix efield <fix_efield>`.
 
diff --git a/doc/src/molecule.rst b/doc/src/molecule.rst
index b930a9fc65..e1770ced2a 100644
--- a/doc/src/molecule.rst
+++ b/doc/src/molecule.rst
@@ -126,14 +126,50 @@ molecule (header keyword = inertia).
 Format of a molecule file
 """""""""""""""""""""""""
 
-The format of an individual molecule file is similar but
-(not identical) to the data file read by the :doc:`read_data <read_data>`
-commands, and is as follows.
+The format of an individual molecule file looks similar to, but is
+different from, that of a data file read by the :doc:`read_data <read_data>`
+command.  Here is a simple example for a TIP3P water molecule:
+
+.. code-block::
+
+   # Water molecule. TIP3P geometry
+   # header section:
+   3 atoms
+   2 bonds
+   1 angles
+
+   # body section:
+   Coords
+
+   1    0.00000  -0.06556   0.00000
+   2    0.75695   0.52032   0.00000
+   3   -0.75695   0.52032   0.00000
+
+   Types
+
+   1        1   # O
+   2        2   # H
+   3        2   # H
+
+   Charges
+
+   1       -0.834
+   2        0.417
+   3        0.417
+
+   Bonds
+
+   1   1      1      2
+   2   1      1      3
+
+   Angles
+
+   1   1      2      1      3
 
 A molecule file has a header and a body.  The header appears first.  The
-first line of the header and thus of the molecule file is *always* skipped;
-it typically contains a description of the file or a comment from the software
-that created the file.
+first line of the header and thus of the molecule file is *always*
+skipped; it typically contains a description of the file or a comment
+from the software that created the file.
 
 Then lines are read one line at a time.  Lines can have a trailing
 comment starting with '#' that is ignored.  There *must* be at least one
@@ -158,25 +194,62 @@ appear if the value(s) are different than the default, except when
 defining a *body* particle, which requires setting the number of
 *atoms* to 1, and setting the *inertia* in a specific section (see below).
 
-* N *atoms* = # of atoms N in molecule, default = 0
-* Nb *bonds* = # of bonds Nb in molecule, default = 0
-* Na *angles* = # of angles Na in molecule, default = 0
-* Nd *dihedrals* = # of dihedrals Nd in molecule, default = 0
-* Ni *impropers* = # of impropers Ni in molecule, default = 0
-* Nf *fragments* = # of fragments Nf in molecule, default = 0
-* Ninteger Ndouble *body* = # of integer and floating-point values
-  in body particle, default = 0
-* Mtotal *mass* = total mass of molecule
-* Xc Yc Zc *com* = coordinates of center-of-mass of molecule
-* Ixx Iyy Izz Ixy Ixz Iyz *inertia* = 6 components of inertia tensor of molecule
+   .. list-table::
+      :header-rows: 1
+      :widths: auto
 
-For *mass*, *com*, and *inertia*, the default is for LAMMPS to
-calculate this quantity itself if needed, assuming the molecules
-consist of a set of point particles or finite-size particles (with a
-non-zero diameter) that do not overlap.  If finite-size particles in
-the molecule do overlap, LAMMPS will not account for the overlap
-effects when calculating any of these 3 quantities, so you should
-pre-compute them yourself and list the values in the file.
+      * - Number(s)
+        - Keyword
+        - Meaning
+        - Default Value
+      * - N
+        - atoms
+        - # of atoms N in molecule
+        - 0
+      * - Nb
+        - bonds
+        - # of bonds Nb in molecule
+        - 0
+      * - Na
+        - angles
+        - # of angles Na in molecule
+        - 0
+      * - Nd
+        - dihedrals
+        - # of dihedrals Nd in molecule
+        - 0
+      * - Ni
+        - impropers
+        - # of impropers Ni in molecule
+        - 0
+      * - Nf
+        - fragments
+        - # of fragments Nf in molecule
+        - 0
+      * - Ninteger Ndouble
+        - body
+        - # of integer and floating-point values in body particle
+        - 0
+      * - Mtotal
+        - mass
+        - total mass of molecule
+        - computed
+      * - Xc Yc Zc
+        - com
+        - coordinates of center-of-mass of molecule
+        - computed
+      * - Ixx Iyy Izz Ixy Ixz Iyz
+        - inertia
+        - 6 components of inertia tensor of molecule
+        - computed
+
+For *mass*, *com*, and *inertia*, the default is for LAMMPS to calculate
+this quantity itself if needed, assuming the molecules consist of a set
+of point particles or finite-size particles (with a non-zero diameter)
+that do **not** overlap.  If finite-size particles in the molecule
+**do** overlap, LAMMPS will not account for the overlap effects when
+calculating any of these 3 quantities, so you should pre-compute them
+yourself and list the values in the file.
 
 The mass and center-of-mass coordinates (Xc,Yc,Zc) are
 self-explanatory.  The 6 moments of inertia (ixx,iyy,izz,ixy,ixz,iyz)
@@ -188,7 +261,7 @@ internally.
 
 These are the allowed section keywords for the body of the file.
 
-* *Coords, Types, Molecules, Fragments, Charges, Diameters, Masses* = atom-property sections
+* *Coords, Types, Molecules, Fragments, Charges, Diameters, Dipoles, Masses* = atom-property sections
 * *Bonds, Angles, Dihedrals, Impropers* = molecular topology sections
 * *Special Bond Counts, Special Bonds* = special neighbor info
 * *Shake Flags, Shake Atoms, Shake Bond Types* = SHAKE info
@@ -303,6 +376,21 @@ not listed, the default diameter of each atom in the molecule is 1.0.
 
 ----------
 
+.. versionadded:: TBD
+
+*Dipoles* section:
+
+* one line per atom
+* line syntax: ID mux muy muz
+* mux,muy,muz = x-, y-, and z-component of point dipole vector of atom
+
+This section is only allowed for :doc:`atom styles <atom_style>` that
+support particles with point dipoles, e.g. atom_style dipole.  If not
+listed, all dipole components of each atom in the molecule default
+to 0.0.
+
+----------
+
 *Masses* section:
 
 * one line per atom
diff --git a/doc/src/pair_charmm.rst b/doc/src/pair_charmm.rst
index 8ff6508dea..30b03ad872 100644
--- a/doc/src/pair_charmm.rst
+++ b/doc/src/pair_charmm.rst
@@ -16,6 +16,7 @@
 .. index:: pair_style lj/charmm/coul/msm/omp
 .. index:: pair_style lj/charmmfsw/coul/charmmfsh
 .. index:: pair_style lj/charmmfsw/coul/long
+.. index:: pair_style lj/charmmfsw/coul/long/kk
 
 pair_style lj/charmm/coul/charmm command
 ========================================
@@ -43,6 +44,8 @@ pair_style lj/charmmfsw/coul/charmmfsh command
 pair_style lj/charmmfsw/coul/long command
 =========================================
 
+Accelerator Variants: *lj/charmmfsw/coul/long/kk*
+
 Syntax
 """"""
 
@@ -281,7 +284,9 @@ page for more info.
 Related commands
 """"""""""""""""
 
-:doc:`pair_coeff <pair_coeff>`
+:doc:`pair_coeff <pair_coeff>`, :doc:`angle_style charmm <angle_charmm>`,
+:doc:`dihedral_style charmm <dihedral_charmm>`,
+:doc:`dihedral_style charmmfsw <dihedral_charmm>`, :doc:`fix cmap <fix_cmap>`
 
 Default
 """""""
diff --git a/doc/src/pair_lepton.rst b/doc/src/pair_lepton.rst
index 21e619a3d9..5b5dc698e7 100644
--- a/doc/src/pair_lepton.rst
+++ b/doc/src/pair_lepton.rst
@@ -72,7 +72,7 @@ interactions between particles which depend on the distance and have a
 cutoff.  The potential function must be provided as an expression string
 using "r" as the distance variable.  With pair style *lepton/coul* one
 may additionally reference the charges of the two atoms of the pair with
-"qi" and "qj", respectively.  With pair style *lepton/coul* one may
+"qi" and "qj", respectively.  With pair style *lepton/sphere* one may
 instead reference the radii of the two atoms of the pair with "radi" and
 "radj", respectively; this is half of the diameter that can be set in
 :doc:`data files <read_data>` or the :doc:`set command <set>`.
@@ -166,8 +166,8 @@ mixing.  Thus, expressions for *all* I,J pairs must be specified
 explicitly.
 
 Only pair style *lepton* supports the :doc:`pair_modify shift <pair_modify>`
-option for shifting the energy of the pair interaction so that it is
-0 at the cutoff, pair styles *lepton/coul* and *lepton/sphere* do *not*.
+option for shifting the potential energy of the pair interaction so that
+it is 0 at the cutoff, pair styles *lepton/coul* and *lepton/sphere* do *not*.
 
 The :doc:`pair_modify table <pair_modify>` options are not relevant for
 the these pair styles.
diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt
index 03e67b95cb..55ac81e04b 100644
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@@ -151,6 +151,7 @@ asphericity
 Asq
 assignee
 assively
+associativity
 Asta
 Astart
 Astop
diff --git a/examples/PACKAGES/cgdna/util/generate.py b/examples/PACKAGES/cgdna/util/generate.py
index cd7465acdb..e85661abb1 100644
--- a/examples/PACKAGES/cgdna/util/generate.py
+++ b/examples/PACKAGES/cgdna/util/generate.py
@@ -22,22 +22,26 @@
 """
 Import basic modules
 """
+
+# for python2/3 compatibility
+from __future__ import print_function
+
 import sys, os, timeit
 
 from timeit import default_timer as timer
 start_time = timer()
 """
-Try to import numpy; if failed, import a local version mynumpy 
+Try to import numpy; if failed, import a local version mynumpy
 which needs to be provided
 """
 try:
     import numpy as np
 except:
-    print >> sys.stderr, "numpy not found. Exiting."
+    print("numpy not found. Exiting.", file=sys.stderr)
     sys.exit(1)
 
 """
-Check that the required arguments (box offset and size in simulation units 
+Check that the required arguments (box offset and size in simulation units
 and the sequence file were provided
 """
 try:
@@ -45,8 +49,8 @@ try:
     box_length = float(sys.argv[2])
     infile = sys.argv[3]
 except:
-    print >> sys.stderr, "Usage: %s <%s> <%s> <%s>" % (sys.argv[0], \
-	"box offset", "box length", "file with sequences")
+    print( "Usage: %s <%s> <%s> <%s>" % (sys.argv[0], \
+	"box offset", "box length", "file with sequences"), file=sys.stderr)
     sys.exit(1)
 box = np.array ([box_length, box_length, box_length])
 
@@ -57,8 +61,7 @@ try:
     inp = open (infile, 'r')
     inp.close()
 except:
-    print >> sys.stderr, "Could not open file '%s' for reading. \
-					      Aborting." % infile
+    print( "Could not open file '%s' for reading. Aborting." % infile, file=sys.stderr)
     sys.exit(2)
 
 # return parts of a string
@@ -86,7 +89,7 @@ Define auxiliary variables for the construction of a helix
 # center of the double strand
 CM_CENTER_DS = POS_BASE + 0.2
 
-# ideal distance between base sites of two nucleotides 
+# ideal distance between base sites of two nucleotides
 # which are to be base paired in a duplex
 BASE_BASE = 0.3897628551303122
 
@@ -118,7 +121,7 @@ strandnum = []
 
 bonds = []
 
-""" 
+"""
 Convert local body frame to quaternion DOF
 """
 def exyz_to_quat (mya1, mya3):
@@ -135,25 +138,25 @@ def exyz_to_quat (mya1, mya3):
     # compute other components from it
 
     if q0sq >= 0.25:
-	myquat[0] = np.sqrt(q0sq)
-	myquat[1] = (mya2[2] - mya3[1]) / (4.0*myquat[0])
-	myquat[2] = (mya3[0] - mya1[2]) / (4.0*myquat[0])
-	myquat[3] = (mya1[1] - mya2[0]) / (4.0*myquat[0])
+        myquat[0] = np.sqrt(q0sq)
+        myquat[1] = (mya2[2] - mya3[1]) / (4.0*myquat[0])
+        myquat[2] = (mya3[0] - mya1[2]) / (4.0*myquat[0])
+        myquat[3] = (mya1[1] - mya2[0]) / (4.0*myquat[0])
     elif q1sq >= 0.25:
-	myquat[1] = np.sqrt(q1sq)
-	myquat[0] = (mya2[2] - mya3[1]) / (4.0*myquat[1])
-	myquat[2] = (mya2[0] + mya1[1]) / (4.0*myquat[1])
-	myquat[3] = (mya1[2] + mya3[0]) / (4.0*myquat[1])
+        myquat[1] = np.sqrt(q1sq)
+        myquat[0] = (mya2[2] - mya3[1]) / (4.0*myquat[1])
+        myquat[2] = (mya2[0] + mya1[1]) / (4.0*myquat[1])
+        myquat[3] = (mya1[2] + mya3[0]) / (4.0*myquat[1])
     elif q2sq >= 0.25:
-	myquat[2] = np.sqrt(q2sq)
-	myquat[0] = (mya3[0] - mya1[2]) / (4.0*myquat[2])
-	myquat[1] = (mya2[0] + mya1[1]) / (4.0*myquat[2])
-	myquat[3] = (mya3[1] + mya2[2]) / (4.0*myquat[2])
+        myquat[2] = np.sqrt(q2sq)
+        myquat[0] = (mya3[0] - mya1[2]) / (4.0*myquat[2])
+        myquat[1] = (mya2[0] + mya1[1]) / (4.0*myquat[2])
+        myquat[3] = (mya3[1] + mya2[2]) / (4.0*myquat[2])
     elif q3sq >= 0.25:
-	myquat[3] = np.sqrt(q3sq)
-	myquat[0] = (mya1[1] - mya2[0]) / (4.0*myquat[3])
-	myquat[1] = (mya3[0] + mya1[2]) / (4.0*myquat[3])
-	myquat[2] = (mya3[1] + mya2[2]) / (4.0*myquat[3])
+        myquat[3] = np.sqrt(q3sq)
+        myquat[0] = (mya1[1] - mya2[0]) / (4.0*myquat[3])
+        myquat[1] = (mya3[0] + mya1[2]) / (4.0*myquat[3])
+        myquat[2] = (mya3[1] + mya2[2]) / (4.0*myquat[3])
 
     norm = 1.0/np.sqrt(myquat[0]*myquat[0] + myquat[1]*myquat[1] + \
 			  myquat[2]*myquat[2] + myquat[3]*myquat[3])
@@ -169,62 +172,62 @@ Adds a strand to the system by appending it to the array of previous strands
 """
 def add_strands (mynewpositions, mynewa1s, mynewa3s):
     overlap = False
-	
-    # This is a simple check for each of the particles where for previously 
-    # placed particles i we check whether it overlaps with any of the 
+
+    # This is a simple check for each of the particles where for previously
+    # placed particles i we check whether it overlaps with any of the
     # newly created particles j
 
-    print >> sys.stdout, "## Checking for overlaps"
+    print( "## Checking for overlaps", file=sys.stdout)
 
-    for i in xrange(len(positions)):
+    for i in range(len(positions)):
 
-	p = positions[i]
-	pa1 = a1s[i]
+        p = positions[i]
+        pa1 = a1s[i]
 
-	for j in xrange (len(mynewpositions)):
+        for j in range (len(mynewpositions)):
 
-	    q = mynewpositions[j]
-	    qa1 = mynewa1s[j]
+            q = mynewpositions[j]
+            qa1 = mynewa1s[j]
 
-	    # skip particles that are anyway too far away
-	    dr = p - q
-	    dr -= box * np.rint (dr / box)
-	    if np.dot(dr, dr) > RC2:
-		continue
+            # skip particles that are anyway too far away
+            dr = p - q
+            dr -= box * np.rint(dr / box)
+            if np.dot(dr, dr) > RC2:
+                continue
 
-	    # base site and backbone site of the two particles
+            # base site and backbone site of the two particles
             p_pos_back = p + pa1 * POS_BACK
             p_pos_base = p + pa1 * POS_BASE
             q_pos_back = q + qa1 * POS_BACK
             q_pos_base = q + qa1 * POS_BASE
 
-	    # check for no overlap between the two backbone sites
+            # check for no overlap between the two backbone sites
             dr = p_pos_back - q_pos_back
-            dr -= box * np.rint (dr / box)
+            dr -= box * np.rint(dr / box)
             if np.dot(dr, dr) < RC2_BACK:
                 overlap = True
 
-	    # check for no overlap between the two base sites
+            # check for no overlap between the two base sites
             dr = p_pos_base -  q_pos_base
-            dr -= box * np.rint (dr / box)
+            dr -= box * np.rint(dr / box)
             if np.dot(dr, dr) < RC2_BASE:
                 overlap = True
 
-	    # check for no overlap between backbone site of particle p 
-	    # with base site of particle q
+            # check for no overlap between backbone site of particle p
+            # with base site of particle q
             dr = p_pos_back - q_pos_base
             dr -= box * np.rint (dr / box)
             if np.dot(dr, dr) < RC2_BACK_BASE:
                 overlap = True
 
-	    # check for no overlap between base site of particle p and 
-	    # backbone site of particle q
+            # check for no overlap between base site of particle p and
+            # backbone site of particle q
             dr = p_pos_base - q_pos_back
             dr -= box * np.rint (dr / box)
             if np.dot(dr, dr) < RC2_BACK_BASE:
                 overlap = True
 
-	    # exit if there is an overlap
+            # exit if there is an overlap
             if overlap:
                 return False
 
@@ -237,10 +240,10 @@ def add_strands (mynewpositions, mynewa1s, mynewa3s):
             a1s.append (p)
         for p in mynewa3s:
             a3s.append (p)
-	# calculate quaternion from local body frame and append
-	for ia in xrange(len(mynewpositions)):
-	    mynewquaternions = exyz_to_quat(mynewa1s[ia],mynewa3s[ia])
-	    quaternions.append(mynewquaternions)
+        # calculate quaternion from local body frame and append
+        for ia in range(len(mynewpositions)):
+            mynewquaternions = exyz_to_quat(mynewa1s[ia],mynewa3s[ia])
+            quaternions.append(mynewquaternions)
 
     return True
 
@@ -281,7 +284,7 @@ def get_rotation_matrix(axis, anglest):
                     [olc*x*z-st*y, olc*y*z+st*x, olc*z*z+ct]])
 
 """
-Generates the position and orientation vectors of a 
+Generates the position and orientation vectors of a
 (single or double) strand from a sequence string
 """
 def generate_strand(bp, sequence=None, start_pos=np.array([0, 0, 0]), \
@@ -295,76 +298,75 @@ def generate_strand(bp, sequence=None, start_pos=np.array([0, 0, 0]), \
     # overall direction of the helix
     dir = np.array(dir, dtype=float)
     if sequence == None:
-	sequence = np.random.randint(1, 5, bp)
+        sequence = np.random.randint(1, 5, bp)
 
-    # the elseif here is most likely redundant 
+    # the elseif here is most likely redundant
     elif len(sequence) != bp:
-	n = bp - len(sequence)
-	sequence += np.random.randint(1, 5, n)
-	print >> sys.stderr, "sequence is too short, adding %d random bases" % n
+        n = bp - len(sequence)
+        sequence += np.random.randint(1, 5, n)
+        print( "sequence is too short, adding %d random bases" % n, file=sys.stderr)
 
     # normalize direction
     dir_norm = np.sqrt(np.dot(dir,dir))
     if dir_norm < 1e-10:
-	print >> sys.stderr, "direction must be a valid vector, \
-			      defaulting to (0, 0, 1)"
-	dir = np.array([0, 0, 1])
+        print( "direction must be a valid vector, defaulting to (0, 0, 1)", file=sys.stderr)
+        dir = np.array([0, 0, 1])
     else: dir /= dir_norm
 
     # find a vector orthogonal to dir to act as helix direction,
     # if not provided switch off random orientation
     if perp is None or perp is False:
-	v1 = np.random.random_sample(3)
-	v1 -= dir * (np.dot(dir, v1))
-	v1 /= np.sqrt(sum(v1*v1))
+        v1 = np.random.random_sample(3)
+        v1 -= dir * (np.dot(dir, v1))
+        v1 /= np.sqrt(sum(v1*v1))
     else:
-	v1 = perp;
+        v1 = perp;
 
     # generate rotational matrix representing the overall rotation of the helix
     R0 = get_rotation_matrix(dir, rot)
-	    
+
     # rotation matrix corresponding to one step along the helix
     R = get_rotation_matrix(dir, [1, "bp"])
 
-    # set the vector a1 (backbone to base) to v1 
+    # set the vector a1 (backbone to base) to v1
     a1 = v1
-    
-    # apply the global rotation to a1 
+
+    # apply the global rotation to a1
     a1 = np.dot(R0, a1)
-    
+
     # set the position of the fist backbone site to start_pos
     rb = np.array(start_pos)
-	    
+
     # set a3 to the direction of the helix
     a3 = dir
     for i in range(bp):
     # work out the position of the centre of mass of the nucleotide
-	rcdm = rb - CM_CENTER_DS * a1
-	
-	# append to newpositions
-	mynewpositions.append(rcdm)
-	mynewa1s.append(a1)
-	mynewa3s.append(a3)
-	
-	# if we are not at the end of the helix, we work out a1 and rb for the 
-	# next nucleotide along the helix
-	if i != bp - 1:
-	    a1 = np.dot(R, a1)
-	    rb += a3 * BASE_BASE
+        rcdm = rb - CM_CENTER_DS * a1
 
-    # if we are working on a double strand, we do a cycle similar 
+        # append to newpositions
+        mynewpositions.append(rcdm)
+        mynewa1s.append(a1)
+        mynewa3s.append(a3)
+
+        # if we are not at the end of the helix, we work out a1 and rb for the
+        # next nucleotide along the helix
+        if i != bp - 1:
+            a1 = np.dot(R, a1)
+            rb += a3 * BASE_BASE
+
+    # if we are working on a double strand, we do a cycle similar
     # to the previous one but backwards
     if double == True:
-	a1 = -a1
-	a3 = -dir
-	R = R.transpose()
-	for i in range(bp):
-	    rcdm = rb - CM_CENTER_DS * a1
-	    mynewpositions.append (rcdm)
-	    mynewa1s.append (a1)
-	    mynewa3s.append (a3)
-	    a1 = np.dot(R, a1)
-	    rb += a3 * BASE_BASE
+        a1 = -a1
+        a3 = -dir
+        R = R.transpose()
+        for i in range(bp):
+            rcdm = rb - CM_CENTER_DS * a1
+            mynewpositions.append (rcdm)
+            mynewa1s.append (a1)
+            mynewa3s.append (a3)
+            a1 = np.dot(R, a1)
+            rb += a3 * BASE_BASE
 
     assert (len (mynewpositions) > 0)
 
@@ -391,10 +393,10 @@ def read_strands(filename):
     try:
         infile = open (filename)
     except:
-        print >> sys.stderr, "Could not open file '%s'. Aborting." % filename
+        print( "Could not open file '%s'. Aborting." % filename, file=sys.stderr )
         sys.exit(2)
 
-    # This block works out the number of nucleotides and strands by reading 
+    # This block works out the number of nucleotides and strands by reading
     # the number of non-empty lines in the input file and the number of letters,
     # taking the possible DOUBLE keyword into account.
     nstrands, nnucl, nbonds = 0, 0, 0
@@ -406,30 +408,29 @@ def read_strands(filename):
         if line[:6] == 'DOUBLE':
             line = line.split()[1]
             length = len(line)
-            print >> sys.stdout, "## Found duplex of %i base pairs" % length
+            print( "## Found duplex of %i base pairs" % length, file=sys.stdout)
             nnucl += 2*length
             nstrands += 2
-	    nbonds += (2*length-2)
+            nbonds += (2*length-2)
         else:
             line = line.split()[0]
             length = len(line)
-            print >> sys.stdout, \
-		    "## Found single strand of %i bases" % length
+            print( "## Found single strand of %i bases" % length, file=sys.stdout)
             nnucl += length
             nstrands += 1
-	    nbonds += length-1
+            nbonds += length-1
     # rewind the sequence input file
     infile.seek(0)
 
-    print >> sys.stdout, "## nstrands, nnucl = ", nstrands, nnucl
+    print( "## nstrands, nnucl = ", nstrands, nnucl, file=sys.stdout)
 
     # generate the data file in LAMMPS format
     try:
         out = open ("data.oxdna", "w")
     except:
-        print >> sys.stderr, "Could not open data file for writing. Aborting."
+        print( "Could not open data file for writing. Aborting.", file=sys.stderr)
         sys.exit(2)
-	
+
     lines = infile.readlines()
     nlines = len(lines)
     i = 1
@@ -440,115 +441,114 @@ def read_strands(filename):
         line = line.upper().strip()
 
         # skip empty lines
-        if len(line) == 0: 
-	    i += 1
-	    continue
+        if len(line) == 0:
+            i += 1
+            continue
 
-	# block for duplexes: last argument of the generate function 
-	# is set to 'True'
+        # block for duplexes: last argument of the generate function
+        # is set to 'True'
         if line[:6] == 'DOUBLE':
             line = line.split()[1]
             length = len(line)
             seq = [(base_to_number[x]) for x in line]
 
-	    myns += 1
-	    for b in xrange(length):
-		basetype.append(seq[b])
-		strandnum.append(myns)
+            myns += 1
+            for b in range(length):
+                basetype.append(seq[b])
+                strandnum.append(myns)
 
-	    for b in xrange(length-1):
-		bondpair = [noffset + b, noffset + b + 1]
-		bonds.append(bondpair)
-	    noffset += length
+            for b in range(length-1):
+                bondpair = [noffset + b, noffset + b + 1]
+                bonds.append(bondpair)
+            noffset += length
 
-	    # create the sequence of the second strand as made of 
-	    # complementary bases
-	    seq2 = [5-s for s in seq]
-	    seq2.reverse()
+            # create the sequence of the second strand as made of
+            # complementary bases
+            seq2 = [5-s for s in seq]
+            seq2.reverse()
 
-	    myns += 1
-	    for b in xrange(length):
-		basetype.append(seq2[b])
-		strandnum.append(myns)
+            myns += 1
+            for b in range(length):
+                basetype.append(seq2[b])
+                strandnum.append(myns)
 
-	    for b in xrange(length-1):
-		bondpair = [noffset + b, noffset + b + 1]
-		bonds.append(bondpair)
-	    noffset += length
- 
-            print >> sys.stdout, "## Created duplex of %i bases" % (2*length)
+            for b in range(length-1):
+                bondpair = [noffset + b, noffset + b + 1]
+                bonds.append(bondpair)
+            noffset += length
 
-	    # generate random position of the first nucleotide
+            print( "## Created duplex of %i bases" % (2*length), file=sys.stdout)
+
+            # generate random position of the first nucleotide
             cdm = box_offset + np.random.random_sample(3) * box
 
-            # generate the random direction of the helix 
+            # generate the random direction of the helix
             axis = np.random.random_sample(3)
             axis /= np.sqrt(np.dot(axis, axis))
 
-            # use the generate function defined above to create 
-	    # the position and orientation vector of the strand 
+            # use the generate function defined above to create
+            # the position and orientation vector of the strand
             newpositions, newa1s, newa3s = generate_strand(len(line), \
-		    sequence=seq, dir=axis, start_pos=cdm, double=True)
+                sequence=seq, dir=axis, start_pos=cdm, double=True)
 
             # generate a new position for the strand until it does not overlap
-	    # with anything already present
-	    start = timer()
+            # with anything already present
+            start = timer()
             while not add_strands(newpositions, newa1s, newa3s):
                 cdm = box_offset + np.random.random_sample(3) * box
                 axis = np.random.random_sample(3)
                 axis /= np.sqrt(np.dot(axis, axis))
                 newpositions, newa1s, newa3s = generate_strand(len(line), \
-		      sequence=seq, dir=axis, start_pos=cdm, double=True)
-                print >> sys.stdout, "## Trying %i" % i
-	    end = timer()
-            print >> sys.stdout, "## Added duplex of %i bases (line %i/%i) in %.2fs, now at %i/%i" % \
-				      (2*length, i, nlines, end-start, len(positions), nnucl)
+                    sequence=seq, dir=axis, start_pos=cdm, double=True)
+                print( "## Trying %i" % i, file=sys.stdout)
+            end = timer()
+            print( "## Added duplex of %i bases (line %i/%i) in %.2fs, now at %i/%i" % \
+				      (2*length, i, nlines, end-start, len(positions), nnucl), file=sys.stdout)
 
-	# block for single strands: last argument of the generate function 
-	# is set to 'False'
+        # block for single strands: last argument of the generate function
+        # is set to 'False'
         else:
             length = len(line)
             seq = [(base_to_number[x]) for x in line]
 
-	    myns += 1
-	    for b in xrange(length):
-		basetype.append(seq[b])
-		strandnum.append(myns)
+            myns += 1
+            for b in range(length):
+                basetype.append(seq[b])
+                strandnum.append(myns)
 
-	    for b in xrange(length-1):
-		bondpair = [noffset + b, noffset + b + 1]
-		bonds.append(bondpair)
-	    noffset += length
+            for b in range(length-1):
+                bondpair = [noffset + b, noffset + b + 1]
+                bonds.append(bondpair)
+            noffset += length
 
-	    # generate random position of the first nucleotide
+            # generate random position of the first nucleotide
             cdm = box_offset + np.random.random_sample(3) * box
 
-            # generate the random direction of the helix 
+            # generate the random direction of the helix
             axis = np.random.random_sample(3)
             axis /= np.sqrt(np.dot(axis, axis))
 
-            print >> sys.stdout, \
-		      "## Created single strand of %i bases" % length
+            print("## Created single strand of %i bases" % length, file=sys.stdout)
 
             newpositions, newa1s, newa3s = generate_strand(length, \
 		      sequence=seq, dir=axis, start_pos=cdm, double=False)
-	    start = timer()
+            start = timer()
             while not add_strands(newpositions, newa1s, newa3s):
                 cdm = box_offset + np.random.random_sample(3) * box
                 axis = np.random.random_sample(3)
-		axis /= np.sqrt(np.dot(axis, axis))
+                axis /= np.sqrt(np.dot(axis, axis))
                 newpositions, newa1s, newa3s = generate_strand(length, \
-			  sequence=seq, dir=axis, start_pos=cdm, double=False)
+                    sequence=seq, dir=axis, start_pos=cdm, double=False)
-                print >> sys.stdout, "## Trying  %i" % (i)
+                print( "## Trying  %i" % (i), file=sys.stdout)
-	    end = timer()
-            print >> sys.stdout, "## Added single strand of %i bases (line %i/%i) in %.2fs, now at %i/%i" % \
-				      (length, i, nlines, end-start,len(positions), nnucl)
+            end = timer()
+            print( "## Added single strand of %i bases (line %i/%i) in %.2fs, now at %i/%i" % \
+				      (length, i, nlines, end-start,len(positions), nnucl), file=sys.stdout)
 
         i += 1
 
     # sanity check
     if not len(positions) == nnucl:
-        print len(positions), nnucl
+        print( len(positions), nnucl )
         raise AssertionError
 
     out.write('# LAMMPS data file\n')
@@ -580,44 +580,41 @@ def read_strands(filename):
     out.write('Atoms\n')
     out.write('\n')
 
-    for i in xrange(nnucl):
-	out.write('%d %d %22.15le %22.15le %22.15le %d 1 1\n' \
-		  % (i+1, basetype[i], \
-		     positions[i][0], positions[i][1], positions[i][2], \
-		     strandnum[i]))
+    for i in range(nnucl):
+        out.write('%d %d %22.15le %22.15le %22.15le %d 1 1\n' \
+            % (i+1, basetype[i], positions[i][0], positions[i][1], positions[i][2], strandnum[i]))
 
     out.write('\n')
     out.write('# Atom-ID, translational, rotational velocity\n')
     out.write('Velocities\n')
     out.write('\n')
 
-    for i in xrange(nnucl):
-	out.write("%d %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le\n" \
-		  % (i+1,0.0,0.0,0.0,0.0,0.0,0.0))
+    for i in range(nnucl):
+        out.write("%d %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le\n" \
+            % (i+1,0.0,0.0,0.0,0.0,0.0,0.0))
 
     out.write('\n')
     out.write('# Atom-ID, shape, quaternion\n')
     out.write('Ellipsoids\n')
     out.write('\n')
 
-    for i in xrange(nnucl):
-	out.write(\
-    "%d %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le\n"  \
-      % (i+1,1.1739845031423408,1.1739845031423408,1.1739845031423408, \
-	quaternions[i][0],quaternions[i][1], quaternions[i][2],quaternions[i][3]))
- 
+    for i in range(nnucl):
+        out.write("%d %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le\n"  \
+            % (i+1,1.1739845031423408,1.1739845031423408,1.1739845031423408, \
+            quaternions[i][0],quaternions[i][1], quaternions[i][2],quaternions[i][3]))
+
     out.write('\n')
     out.write('# Bond topology\n')
     out.write('Bonds\n')
     out.write('\n')
 
-    for i in xrange(nbonds):
-	out.write("%d  %d  %d  %d\n" % (i+1,1,bonds[i][0],bonds[i][1]))
+    for i in range(nbonds):
+        out.write("%d  %d  %d  %d\n" % (i+1,1,bonds[i][0],bonds[i][1]))
 
     out.close()
 
-    print >> sys.stdout, "## Wrote data to 'data.oxdna'"
-    print >> sys.stdout, "## DONE"
+    print("## Wrote data to 'data.oxdna'", file=sys.stdout)
+    print("## DONE", file=sys.stdout)
 
 # call the above main() function, which executes the program
 read_strands (infile)
@@ -627,4 +624,6 @@ runtime = end_time-start_time
 hours = runtime/3600
 minutes = (runtime-np.rint(hours)*3600)/60
 seconds = (runtime-np.rint(hours)*3600-np.rint(minutes)*60)%60
-print >> sys.stdout, "## Total runtime %ih:%im:%.2fs" % (hours,minutes,seconds)
+print( "## Total runtime %ih:%im:%.2fs" % (hours,minutes,seconds), file=sys.stdout)
+
+
diff --git a/examples/PACKAGES/cgdna/util/generate_simple.py b/examples/PACKAGES/cgdna/util/generate_simple.py
index 33cf1ee7f5..7702bfc7f5 100644
--- a/examples/PACKAGES/cgdna/util/generate_simple.py
+++ b/examples/PACKAGES/cgdna/util/generate_simple.py
@@ -1,5 +1,8 @@
 # Setup tool for oxDNA input in LAMMPS format.
 
+# for python2/3 compatibility
+from __future__ import print_function
+
 import math,numpy as np,sys,os
 
 # system size
@@ -250,59 +253,59 @@ def duplex_array():
       qrot3=math.sin(0.5*twist)
 
       for letter in strand[2]:
-	temp1=[]
-	temp2=[]
+        temp1=[]
+        temp2=[]
 
-	temp1.append(nt2num[letter])
-	temp2.append(compnt2num[letter])
+        temp1.append(nt2num[letter])
+        temp2.append(compnt2num[letter])
 
-	temp1.append([posx1,posy1,posz1])
-	temp2.append([posx2,posy2,posz2])
+        temp1.append([posx1,posy1,posz1])
+        temp2.append([posx2,posy2,posz2])
 
-	vel=[0,0,0,0,0,0]
-	temp1.append(vel)
-	temp2.append(vel)
+        vel=[0,0,0,0,0,0]
+        temp1.append(vel)
+        temp2.append(vel)
 
-	temp1.append(shape)
-	temp2.append(shape)
+        temp1.append(shape)
+        temp2.append(shape)
 
-	temp1.append(quat1)
-	temp2.append(quat2)
+        temp1.append(quat1)
+        temp2.append(quat2)
 
-	quat1_0 = quat1[0]*qrot0 - quat1[1]*qrot1 - quat1[2]*qrot2 - quat1[3]*qrot3 
-	quat1_1 = quat1[0]*qrot1 + quat1[1]*qrot0 + quat1[2]*qrot3 - quat1[3]*qrot2 
-	quat1_2 = quat1[0]*qrot2 + quat1[2]*qrot0 + quat1[3]*qrot1 - quat1[1]*qrot3 
-	quat1_3 = quat1[0]*qrot3 + quat1[3]*qrot0 + quat1[1]*qrot2 + quat1[2]*qrot1 
+        quat1_0 = quat1[0]*qrot0 - quat1[1]*qrot1 - quat1[2]*qrot2 - quat1[3]*qrot3
+        quat1_1 = quat1[0]*qrot1 + quat1[1]*qrot0 + quat1[2]*qrot3 - quat1[3]*qrot2
+        quat1_2 = quat1[0]*qrot2 + quat1[2]*qrot0 + quat1[3]*qrot1 - quat1[1]*qrot3
+        quat1_3 = quat1[0]*qrot3 + quat1[3]*qrot0 + quat1[1]*qrot2 + quat1[2]*qrot1
 
-	quat1 = [quat1_0,quat1_1,quat1_2,quat1_3]
+        quat1 = [quat1_0,quat1_1,quat1_2,quat1_3]
 
-	posx1=axisx - dcomh*(quat1[0]**2+quat1[1]**2-quat1[2]**2-quat1[3]**2)
-	posy1=axisy - dcomh*(2*(quat1[1]*quat1[2]+quat1[0]*quat1[3]))
-	posz1=posz1+risez
+        posx1=axisx - dcomh*(quat1[0]**2+quat1[1]**2-quat1[2]**2-quat1[3]**2)
+        posy1=axisy - dcomh*(2*(quat1[1]*quat1[2]+quat1[0]*quat1[3]))
+        posz1=posz1+risez
 
-	quat2_0 = quat2[0]*qrot0 - quat2[1]*qrot1 - quat2[2]*qrot2 + quat2[3]*qrot3 
-	quat2_1 = quat2[0]*qrot1 + quat2[1]*qrot0 - quat2[2]*qrot3 - quat2[3]*qrot2 
-	quat2_2 = quat2[0]*qrot2 + quat2[2]*qrot0 + quat2[3]*qrot1 + quat2[1]*qrot3 
-	quat2_3 =-quat2[0]*qrot3 + quat2[3]*qrot0 + quat2[1]*qrot2 + quat2[2]*qrot1 
+        quat2_0 = quat2[0]*qrot0 - quat2[1]*qrot1 - quat2[2]*qrot2 + quat2[3]*qrot3
+        quat2_1 = quat2[0]*qrot1 + quat2[1]*qrot0 - quat2[2]*qrot3 - quat2[3]*qrot2
+        quat2_2 = quat2[0]*qrot2 + quat2[2]*qrot0 + quat2[3]*qrot1 + quat2[1]*qrot3
+        quat2_3 =-quat2[0]*qrot3 + quat2[3]*qrot0 + quat2[1]*qrot2 + quat2[2]*qrot1
 
-	quat2 = [quat2_0,quat2_1,quat2_2,quat2_3]
+        quat2 = [quat2_0,quat2_1,quat2_2,quat2_3]
 
-	posx2=axisx + dcomh*(quat1[0]**2+quat1[1]**2-quat1[2]**2-quat1[3]**2)
-	posy2=axisy + dcomh*(2*(quat1[1]*quat1[2]+quat1[0]*quat1[3]))
-	posz2=posz1
+        posx2=axisx + dcomh*(quat1[0]**2+quat1[1]**2-quat1[2]**2-quat1[3]**2)
+        posy2=axisy + dcomh*(2*(quat1[1]*quat1[2]+quat1[0]*quat1[3]))
+        posz2=posz1
 
-	if (len(nucleotide)+1 > strandstart):
-	  topology.append([1,len(nucleotide),len(nucleotide)+1])
-	  comptopo.append([1,len(nucleotide)+len(strand[2]),len(nucleotide)+len(strand[2])+1])
+        if (len(nucleotide)+1 > strandstart):
+          topology.append([1,len(nucleotide),len(nucleotide)+1])
+          comptopo.append([1,len(nucleotide)+len(strand[2]),len(nucleotide)+len(strand[2])+1])
 
-	nucleotide.append(temp1)
-	compstrand.append(temp2)
+        nucleotide.append(temp1)
+        compstrand.append(temp2)
 
       for ib in range(len(compstrand)):
-	nucleotide.append(compstrand[len(compstrand)-1-ib])
+        nucleotide.append(compstrand[len(compstrand)-1-ib])
 
       for ib in range(len(comptopo)):
-	topology.append(comptopo[ib])
+        topology.append(comptopo[ib])
 
   return
 
diff --git a/examples/PACKAGES/reaction/create_atoms_polystyrene/in.grow_styrene b/examples/PACKAGES/reaction/create_atoms_polystyrene/in.grow_styrene
index 7860db4e55..dcca29c026 100644
--- a/examples/PACKAGES/reaction/create_atoms_polystyrene/in.grow_styrene
+++ b/examples/PACKAGES/reaction/create_atoms_polystyrene/in.grow_styrene
@@ -40,7 +40,7 @@ fix 1 statted_grp_REACT nvt temp $T $T 100
 
 fix 4 bond_react_MASTER_group temp/rescale 1 $T $T 1 1
 
-thermo_style custom step temp press density f_myrxns[1]
+thermo_style custom step temp press density f_myrxns[*]
 
 thermo 100
 
diff --git a/examples/PACKAGES/reaction/nylon,6-6_melt/in.large_nylon_melt b/examples/PACKAGES/reaction/nylon,6-6_melt/in.large_nylon_melt
index 9678a714d6..635b2c9750 100644
--- a/examples/PACKAGES/reaction/nylon,6-6_melt/in.large_nylon_melt
+++ b/examples/PACKAGES/reaction/nylon,6-6_melt/in.large_nylon_melt
@@ -26,7 +26,7 @@ read_data large_nylon_melt.data.gz &
   extra/angle/per/atom 15 &
   extra/dihedral/per/atom 15 &
   extra/improper/per/atom 25 &
-  extra/special/per/atom 25 
+  extra/special/per/atom 25
 
 velocity all create 800.0 4928459 dist gaussian
 
@@ -50,7 +50,7 @@ fix 1 statted_grp_REACT nvt temp 800 800 100
 # you can use the internally created 'bond_react_MASTER_group', like so:
 # fix 2 bond_react_MASTER_group temp/rescale 1 800 800 10 1
 
-thermo_style custom step temp press density f_myrxns[1] f_myrxns[2] # cumulative reaction counts
+thermo_style custom step temp press density f_myrxns[*] # cumulative reaction counts
 
 # restart 100 restart1 restart2
 
diff --git a/examples/PACKAGES/reaction/tiny_epoxy/in.tiny_epoxy.stabilized b/examples/PACKAGES/reaction/tiny_epoxy/in.tiny_epoxy.stabilized
index 57b03b630f..7e0350cdb0 100644
--- a/examples/PACKAGES/reaction/tiny_epoxy/in.tiny_epoxy.stabilized
+++ b/examples/PACKAGES/reaction/tiny_epoxy/in.tiny_epoxy.stabilized
@@ -20,7 +20,8 @@ improper_style class2
 special_bonds lj/coul 0 0 1
 pair_modify tail yes mix sixthpower
 
-read_data tiny_epoxy.data
+read_data tiny_epoxy.data &
+  extra/special/per/atom 25
 
 velocity all create 300.0 4928459 dist gaussian
 
@@ -44,7 +45,7 @@ fix rxns all bond/react stabilization yes statted_grp .03 &
 
 fix 1 statted_grp_REACT nvt temp 300 300 100
 
-thermo_style custom step temp f_rxns[1] f_rxns[2] f_rxns[3] f_rxns[4]
+thermo_style custom step temp f_rxns[*]
 
 run 2000
 
diff --git a/examples/PACKAGES/reaction/tiny_nylon/in.tiny_nylon.stabilized b/examples/PACKAGES/reaction/tiny_nylon/in.tiny_nylon.stabilized
index 95b39033db..853bc45f1e 100644
--- a/examples/PACKAGES/reaction/tiny_nylon/in.tiny_nylon.stabilized
+++ b/examples/PACKAGES/reaction/tiny_nylon/in.tiny_nylon.stabilized
@@ -50,7 +50,7 @@ fix 1 statted_grp_REACT nvt temp 300 300 100
 # by using the internally-created 'bond_react_MASTER_group', like so:
 fix 4 bond_react_MASTER_group temp/rescale 1 300 300 10 1
 
-thermo_style custom step temp press density f_myrxns[1] f_myrxns[2]
+thermo_style custom step temp press density f_myrxns[*]
 
 # restart 100 restart1 restart2
 
diff --git a/examples/PACKAGES/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability b/examples/PACKAGES/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability
index 88b5a95a41..f3c32f3cbd 100644
--- a/examples/PACKAGES/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability
+++ b/examples/PACKAGES/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability
@@ -54,7 +54,7 @@ fix 1 statted_grp_REACT nvt temp 300 300 100
 # by using the internally-created 'bond_react_MASTER_group', like so:
 fix 4 bond_react_MASTER_group temp/rescale 1 300 300 10 1
 
-thermo_style custom step temp press density v_prob1 v_prob2 f_myrxns[1] f_myrxns[2]
+thermo_style custom step temp press density v_prob1 v_prob2 f_myrxns[*]
 
 # restart 100 restart1 restart2
 
diff --git a/examples/PACKAGES/reaction/tiny_nylon/in.tiny_nylon.unstabilized b/examples/PACKAGES/reaction/tiny_nylon/in.tiny_nylon.unstabilized
index a569e28d43..e5cbaaaf86 100644
--- a/examples/PACKAGES/reaction/tiny_nylon/in.tiny_nylon.unstabilized
+++ b/examples/PACKAGES/reaction/tiny_nylon/in.tiny_nylon.unstabilized
@@ -47,7 +47,7 @@ fix myrxns all bond/react stabilization no &
 
 fix 1 all nve/limit .03
 
-thermo_style custom step temp press density f_myrxns[1] f_myrxns[2]
+thermo_style custom step temp press density f_myrxns[*]
 
 # restart 100 restart1 restart2
 
diff --git a/examples/PACKAGES/reaction/tiny_polystyrene/in.tiny_polystyrene.stabilized b/examples/PACKAGES/reaction/tiny_polystyrene/in.tiny_polystyrene.stabilized
index 4ecc481719..230998fcd3 100644
--- a/examples/PACKAGES/reaction/tiny_polystyrene/in.tiny_polystyrene.stabilized
+++ b/examples/PACKAGES/reaction/tiny_polystyrene/in.tiny_polystyrene.stabilized
@@ -51,7 +51,7 @@ fix 1 statted_grp_REACT nvt temp $T $T 100
 
 fix 4 bond_react_MASTER_group temp/rescale 1 $T $T 1 1
 
-thermo_style custom step temp press density f_rxn1[1] f_rxn1[2] f_rxn1[3]
+thermo_style custom step temp press density f_rxn1[*]
 
 run 10000
 
diff --git a/lib/gpu/lal_base_sph.h b/lib/gpu/lal_base_sph.h
index e1e5731573..d37e85f170 100644
--- a/lib/gpu/lal_base_sph.h
+++ b/lib/gpu/lal_base_sph.h
@@ -15,7 +15,7 @@
  ***************************************************************************/
 
 #ifndef LAL_BASE_SPH_H
-#define LAL_BASE_DPD_H
+#define LAL_BASE_SPH_H
 
 #include "lal_device.h"
 #include "lal_balance.h"
diff --git a/lib/gpu/lal_coul_slater_long.cu b/lib/gpu/lal_coul_slater_long.cu
index 1fc8ab8be4..49cf47b8b3 100644
--- a/lib/gpu/lal_coul_slater_long.cu
+++ b/lib/gpu/lal_coul_slater_long.cu
@@ -102,6 +102,7 @@ __kernel void k_coul_slater_long(const __global numtyp4 *restrict x_,
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
         _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
         fetch(prefactor,j,q_tex);
+        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
         numtyp rlamdainv = r * lamdainv;
         numtyp exprlmdainv = ucl_exp((numtyp)-2.0*rlamdainv);
         numtyp slater_term = exprlmdainv*((numtyp)1.0 + ((numtyp)2.0*rlamdainv*((numtyp)1.0+rlamdainv)));
diff --git a/lib/gpu/lal_coul_slater_long.h b/lib/gpu/lal_coul_slater_long.h
index 8950fd81ef..1731992a16 100644
--- a/lib/gpu/lal_coul_slater_long.h
+++ b/lib/gpu/lal_coul_slater_long.h
@@ -13,8 +13,8 @@
     email                : ndactrung@gmail.com
  ***************************************************************************/
 
-#ifndef LAL_Coul_Slater_Long_H
-#define LAL_Coul_Slater_Long_H
+#ifndef LAL_COUL_SLATER_LONG_H
+#define LAL_COUL_SLATER_LONG_H
 
 #include "lal_base_charge.h"
 
diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp
index b7bc7b958a..0a2ed21ab3 100644
--- a/lib/gpu/lal_eam.cpp
+++ b/lib/gpu/lal_eam.cpp
@@ -303,7 +303,7 @@ double EAMT::host_memory_usage() const {
 }
 
 // ---------------------------------------------------------------------------
-// Copy nbor list from host if necessary and then compute atom energies/forces
+// Copy nbor list from host if necessary and then compute per-atom fp
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
@@ -379,7 +379,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
 }
 
 // ---------------------------------------------------------------------------
-// Reneighbor on GPU and then compute per-atom densities
+// Reneighbor on GPU and then compute per-atom fp
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 int** EAMT::compute(const int ago, const int inum_full, const int nall,
@@ -461,7 +461,7 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
 }
 
 // ---------------------------------------------------------------------------
-// Copy nbor list from host if necessary and then calculate forces, virials,..
+// Update per-atom fp, and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void EAMT::compute2(int *ilist, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_sph_heatconduction.h b/lib/gpu/lal_sph_heatconduction.h
index 23241e8c92..cd7a46e3bd 100644
--- a/lib/gpu/lal_sph_heatconduction.h
+++ b/lib/gpu/lal_sph_heatconduction.h
@@ -13,8 +13,8 @@
     email                : ndactrung@gmail.com
  ***************************************************************************/
 
-#ifndef LAL_SPH_LJ_H
-#define LAL_SPH_LJ_H
+#ifndef LAL_SPH_HEATCONDUCTION_H
+#define LAL_SPH_HEATCONDUCTION_H
 
 #include "lal_base_sph.h"
 
diff --git a/lib/kokkos/core/src/Kokkos_Printf.hpp b/lib/kokkos/core/src/Kokkos_Printf.hpp
index 39f95825c3..af20221a5a 100644
--- a/lib/kokkos/core/src/Kokkos_Printf.hpp
+++ b/lib/kokkos/core/src/Kokkos_Printf.hpp
@@ -31,7 +31,7 @@ namespace Kokkos {
 // backends. The GPU backends always return 1 and NVHPC only compiles if we
 // don't ask for the return value.
 template <typename... Args>
-KOKKOS_FUNCTION void printf(const char* format, Args... args) {
+KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) {
 #ifdef KOKKOS_ENABLE_SYCL
   // Some compilers warn if "args" is empty and format is not a string literal
   if constexpr (sizeof...(Args) == 0)
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
index 03f5fff395..4586406e16 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
@@ -219,8 +219,6 @@ KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions,
     Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions,
                                   partition_size);
 
-    OpenMP::memory_space space;
-
 #pragma omp parallel num_threads(num_partitions)
     {
       Exec thread_local_instance(partition_size);
diff --git a/src/.gitignore b/src/.gitignore
index 112a1486f7..1e4c5b9ddb 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -348,6 +348,8 @@
 /compute_nbond_atom.h
 /fix_nve_bpm_sphere.cpp
 /fix_nve_bpm_sphere.h
+/fix_update_special_bonds.cpp
+/fix_update_special_bonds.h
 /pair_bpm_spring.cpp
 /pair_bpm_spring.h
 
diff --git a/src/BPM/bond_bpm.cpp b/src/BPM/bond_bpm.cpp
index 3ebeed3f1d..b484df7fab 100644
--- a/src/BPM/bond_bpm.cpp
+++ b/src/BPM/bond_bpm.cpp
@@ -224,7 +224,7 @@ void BondBPM::settings(int narg, char **arg)
 
       ifix = modify->get_fix_by_id(id_fix_prop_atom);
       if (!ifix)
-        ifix = modify->add_fix(fmt::format("{} all property/atom {} {} {} ghost yes",
+        ifix = modify->add_fix(fmt::format("{} all property/atom d_{} d_{} d_{} ghost yes",
                                            id_fix_prop_atom, x_ref_id, y_ref_id, z_ref_id));
 
       int type_flag;
diff --git a/src/fix_update_special_bonds.cpp b/src/BPM/fix_update_special_bonds.cpp
similarity index 73%
rename from src/fix_update_special_bonds.cpp
rename to src/BPM/fix_update_special_bonds.cpp
index 159b2a1170..b6bf8b433f 100644
--- a/src/fix_update_special_bonds.cpp
+++ b/src/BPM/fix_update_special_bonds.cpp
@@ -20,6 +20,7 @@
 #include "force.h"
 #include "modify.h"
 #include "neigh_list.h"
+#include "neighbor.h"
 #include "pair.h"
 
 #include <utility>
@@ -61,7 +62,8 @@ void FixUpdateSpecialBonds::setup(int /*vflag*/)
   // Require atoms know about all of their bonds and if they break
   if (force->newton_bond) error->all(FLERR, "Fix update/special/bonds requires Newton bond off");
 
-  if (!atom->avec->bonds_allow) error->all(FLERR, "Fix update/special/bonds requires atom bonds");
+  if (!atom->avec->bonds_allow)
+    error->all(FLERR, "Fix update/special/bonds requires an atom style supporting bonds");
 
   // special lj must be 0 1 1 to censor pair forces between bonded particles
   // special coulomb must be 1 1 1 to ensure all pairs are included in the
@@ -72,9 +74,6 @@ void FixUpdateSpecialBonds::setup(int /*vflag*/)
       force->special_coul[3] != 1.0)
     error->all(FLERR, "Fix update/special/bonds requires special Coulomb weights = 1,1,1");
   // Implies neighbor->special_flag = [X, 2, 1, 1]
-
-  if (utils::strmatch(force->pair_style, "^hybrid"))
-    error->all(FLERR, "Cannot use fix update/special/bonds with hybrid pair styles");
 }
 
 /* ----------------------------------------------------------------------
@@ -158,69 +157,86 @@ void FixUpdateSpecialBonds::pre_force(int /*vflag*/)
   int i1, i2, j, jj, jnum;
   int *jlist, *numneigh, **firstneigh;
   tagint tag1, tag2;
+  NeighList *list;
 
   int nlocal = atom->nlocal;
-
   tagint *tag = atom->tag;
-  NeighList *list = force->pair->list;    // may need to be generalized for pair hybrid*
-  numneigh = list->numneigh;
-  firstneigh = list->firstneigh;
 
   // In theory could communicate a list of broken bonds to neighboring processors here
   // to remove restriction that users use Newton bond off
 
-  for (auto const &it : new_broken_pairs) {
-    tag1 = it.first;
-    tag2 = it.second;
-    i1 = atom->map(tag1);
-    i2 = atom->map(tag2);
+  for (int ilist = 0; ilist < neighbor->nlist; ilist++) {
+    list = neighbor->lists[ilist];
 
-    // Loop through atoms of owned atoms i j
-    if (i1 < nlocal) {
-      jlist = firstneigh[i1];
-      jnum = numneigh[i1];
-      for (jj = 0; jj < jnum; jj++) {
-        j = jlist[jj];
-        j &= SPECIALMASK;    // Clear special bond bits
-        if (tag[j] == tag2) jlist[jj] = j;
+    // Skip copied lists, will update original
+    if (list->copy) continue;
+
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+
+    for (auto const &it : new_broken_pairs) {
+      tag1 = it.first;
+      tag2 = it.second;
+      i1 = atom->map(tag1);
+      i2 = atom->map(tag2);
+
+      // Loop through atoms of owned atoms i j
+      if (i1 < nlocal) {
+        jlist = firstneigh[i1];
+        jnum = numneigh[i1];
+        for (jj = 0; jj < jnum; jj++) {
+          j = jlist[jj];
+          j &= SPECIALMASK;    // Clear special bond bits
+          if (tag[j] == tag2) jlist[jj] = j;
+        }
       }
-    }
 
-    if (i2 < nlocal) {
-      jlist = firstneigh[i2];
-      jnum = numneigh[i2];
-      for (jj = 0; jj < jnum; jj++) {
-        j = jlist[jj];
-        j &= SPECIALMASK;    // Clear special bond bits
-        if (tag[j] == tag1) jlist[jj] = j;
+      if (i2 < nlocal) {
+        jlist = firstneigh[i2];
+        jnum = numneigh[i2];
+        for (jj = 0; jj < jnum; jj++) {
+          j = jlist[jj];
+          j &= SPECIALMASK;    // Clear special bond bits
+          if (tag[j] == tag1) jlist[jj] = j;
+        }
       }
     }
   }
 
-  for (auto const &it : new_created_pairs) {
-    tag1 = it.first;
-    tag2 = it.second;
-    i1 = atom->map(tag1);
-    i2 = atom->map(tag2);
+  for (int ilist = 0; ilist < neighbor->nlist; ilist++) {
+    list = neighbor->lists[ilist];
 
-    // Loop through atoms of owned atoms i j and update SB bits
-    if (i1 < nlocal) {
-      jlist = firstneigh[i1];
-      jnum = numneigh[i1];
-      for (jj = 0; jj < jnum; jj++) {
-        j = jlist[jj];
-        if (((j >> SBBITS) & 3) != 0) continue;               // Skip bonded pairs
-        if (tag[j] == tag2) jlist[jj] = j ^ (1 << SBBITS);    // Add 1-2 special bond bits
+    // Skip copied lists, will update original
+    if (list->copy) continue;
+
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+
+    for (auto const &it : new_created_pairs) {
+      tag1 = it.first;
+      tag2 = it.second;
+      i1 = atom->map(tag1);
+      i2 = atom->map(tag2);
+
+      // Loop through atoms of owned atoms i j and update SB bits
+      if (i1 < nlocal) {
+        jlist = firstneigh[i1];
+        jnum = numneigh[i1];
+        for (jj = 0; jj < jnum; jj++) {
+          j = jlist[jj];
+          if (((j >> SBBITS) & 3) != 0) continue;               // Skip bonded pairs
+          if (tag[j] == tag2) jlist[jj] = j ^ (1 << SBBITS);    // Add 1-2 special bond bits
+        }
       }
-    }
 
-    if (i2 < nlocal) {
-      jlist = firstneigh[i2];
-      jnum = numneigh[i2];
-      for (jj = 0; jj < jnum; jj++) {
-        j = jlist[jj];
-        if (((j >> SBBITS) & 3) != 0) continue;               // Skip bonded pairs
-        if (tag[j] == tag1) jlist[jj] = j ^ (1 << SBBITS);    // Add 1-2 special bond bits
+      if (i2 < nlocal) {
+        jlist = firstneigh[i2];
+        jnum = numneigh[i2];
+        for (jj = 0; jj < jnum; jj++) {
+          j = jlist[jj];
+          if (((j >> SBBITS) & 3) != 0) continue;               // Skip bonded pairs
+          if (tag[j] == tag1) jlist[jj] = j ^ (1 << SBBITS);    // Add 1-2 special bond bits
+        }
       }
     }
   }
diff --git a/src/fix_update_special_bonds.h b/src/BPM/fix_update_special_bonds.h
similarity index 100%
rename from src/fix_update_special_bonds.h
rename to src/BPM/fix_update_special_bonds.h
diff --git a/src/BPM/pair_bpm_spring.cpp b/src/BPM/pair_bpm_spring.cpp
index 1177156359..01cee91b4c 100644
--- a/src/BPM/pair_bpm_spring.cpp
+++ b/src/BPM/pair_bpm_spring.cpp
@@ -19,6 +19,7 @@
 #include "force.h"
 #include "memory.h"
 #include "neigh_list.h"
+#include "neighbor.h"
 
 #include <cmath>
 
@@ -202,6 +203,18 @@ void PairBPMSpring::coeff(int narg, char **arg)
   if (count == 0) error->all(FLERR, "Incorrect args for pair coefficients");
 }
 
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairBPMSpring::init_style()
+{
+  if (comm->ghost_velocity == 0)
+    error->all(FLERR,"Pair bpm/spring requires ghost atoms store velocity");
+
+  neighbor->add_request(this);
+}
+
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
diff --git a/src/BPM/pair_bpm_spring.h b/src/BPM/pair_bpm_spring.h
index 3cb281bff3..c10e4a3400 100644
--- a/src/BPM/pair_bpm_spring.h
+++ b/src/BPM/pair_bpm_spring.h
@@ -31,6 +31,7 @@ class PairBPMSpring : public Pair {
   void compute(int, int) override;
   void settings(int, char **) override;
   void coeff(int, char **) override;
+  void init_style() override;
   double init_one(int, int) override;
   void write_restart(FILE *) override;
   void read_restart(FILE *) override;
diff --git a/src/Depend.sh b/src/Depend.sh
index dbffb2dba0..3df1347e67 100755
--- a/src/Depend.sh
+++ b/src/Depend.sh
@@ -99,6 +99,7 @@ fi
 
 if (test $1 = "EXTRA-PAIR") then
   depend GPU
+  depend KOKKOS
   depend OPENMP
 fi
 
diff --git a/src/EFF/fix_langevin_eff.cpp b/src/EFF/fix_langevin_eff.cpp
index 8c255e4348..a25b6ac837 100644
--- a/src/EFF/fix_langevin_eff.cpp
+++ b/src/EFF/fix_langevin_eff.cpp
@@ -137,7 +137,7 @@ void FixLangevinEff::post_force_no_tally()
   dof = domain->dimension * particles;
   fix_dof = 0;
   for (int i = 0; i < modify->nfix; i++)
-    fix_dof += modify->fix[i]->dof(igroup);
+    fix_dof += (int)modify->fix[i]->dof(igroup);
 
   // extra_dof = domain->dimension
   dof -= domain->dimension + fix_dof;
@@ -306,7 +306,7 @@ void FixLangevinEff::post_force_tally()
   dof = domain->dimension * particles;
   fix_dof = 0;
   for (int i = 0; i < modify->nfix; i++)
-    fix_dof += modify->fix[i]->dof(igroup);
+    fix_dof += (int)modify->fix[i]->dof(igroup);
 
   // extra_dof = domain->dimension
   dof -= domain->dimension + fix_dof;
diff --git a/src/ELECTRODE/pppm_electrode.cpp b/src/ELECTRODE/pppm_electrode.cpp
index 6ede0f1f4d..0ae3da6863 100644
--- a/src/ELECTRODE/pppm_electrode.cpp
+++ b/src/ELECTRODE/pppm_electrode.cpp
@@ -633,7 +633,9 @@ void PPPMElectrode::project_psi(double *vec, int sensor_grpbit)
   // project u_brick with weight matrix
   double **x = atom->x;
   int *mask = atom->mask;
-  double const scaleinv = 1.0 / (nx_pppm * ny_pppm * nz_pppm);
+  const bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
+  const double scaleinv = 1.0 / ngridtotal;
+
   for (int i = 0; i < atom->nlocal; i++) {
     if (!(mask[i] & sensor_grpbit)) continue;
     double v = 0.;
@@ -1362,7 +1364,7 @@ double PPPMElectrode::compute_qopt()
   // each proc calculates contributions from every Pth grid point
 
   bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
-  int nxy_pppm = nx_pppm * ny_pppm;
+  bigint nxy_pppm = (bigint) nx_pppm * ny_pppm;
 
   double qopt = 0.0;
 
diff --git a/src/EXTRA-COMPUTE/compute_rattlers_atom.cpp b/src/EXTRA-COMPUTE/compute_rattlers_atom.cpp
index 602923b58a..9dacf14171 100644
--- a/src/EXTRA-COMPUTE/compute_rattlers_atom.cpp
+++ b/src/EXTRA-COMPUTE/compute_rattlers_atom.cpp
@@ -144,7 +144,6 @@ void ComputeRattlersAtom::compute_peratom()
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
 
-  Pair *pair = force->pair;
   double **cutsq = force->pair->cutsq;
 
   int change_flag = 1;
diff --git a/src/EXTRA-COMPUTE/compute_rattlers_atom.h b/src/EXTRA-COMPUTE/compute_rattlers_atom.h
index 257bae8374..79a0a0a982 100644
--- a/src/EXTRA-COMPUTE/compute_rattlers_atom.h
+++ b/src/EXTRA-COMPUTE/compute_rattlers_atom.h
@@ -38,8 +38,7 @@ class ComputeRattlersAtom : public Compute {
   void unpack_reverse_comm(int, int *, double *) override;
 
  private:
-  int pstyle, cutstyle;
-  int ncontacts_rattler, max_tries, nmax, invoked_peratom;
+  int cutstyle, ncontacts_rattler, max_tries, nmax, invoked_peratom;
   int *ncontacts;
   double *rattler;
   class NeighList *list;
diff --git a/src/EXTRA-COMPUTE/compute_slcsa_atom.cpp b/src/EXTRA-COMPUTE/compute_slcsa_atom.cpp
index 509362a73b..6c272938b6 100644
--- a/src/EXTRA-COMPUTE/compute_slcsa_atom.cpp
+++ b/src/EXTRA-COMPUTE/compute_slcsa_atom.cpp
@@ -33,21 +33,20 @@
 
 #include <cmath>
 #include <cstring>
-#include <iostream>
 
 using namespace LAMMPS_NS;
 
 static const char cite_compute_slcsa_atom_c[] =
-    "compute slcsa/atom command: doi:10.1088/0965-0393/21/5/055020\n\n"
+    "compute slcsa/atom command: doi:10.1016/j.commatsci.2023.112534\n\n"
     "@Article{Lafourcade2023,\n"
     " author = {P. Lafourcade and J.-B. Maillet and C. Denoual and E. Duval and A. Allera and A. "
     "M. Goryaeva and M.-C. Marinica},\n"
     " title = {Robust crystal structure identification at extreme conditions using a "
     "density-independent spectral descriptor and supervised learning},\n"
     " journal = {Computational Materials Science},\n"
-    " year =    2023,\n"
-    " volume =  XX,\n"
-    " pages =   {XXXXXX}\n"
+    " year = 2023,\n"
+    " volume = 230,\n"
+    " pages = 112534\n"
     "}\n\n";
 
 /* ---------------------------------------------------------------------- */
@@ -79,6 +78,8 @@ ComputeSLCSAAtom::ComputeSLCSAAtom(LAMMPS *lmp, int narg, char **arg) :
   // # LR bias vector
   // vector with 1 row x nclasses cols
 
+  if (lmp->citeme) lmp->citeme->add(cite_compute_slcsa_atom_c);
+
   if (narg != 11) utils::missing_cmd_args(FLERR, "compute slcsa/atom", error);
 
   int twojmax = utils::inumeric(FLERR, arg[3], false, lmp);
diff --git a/src/EXTRA-COMPUTE/compute_slcsa_atom.h b/src/EXTRA-COMPUTE/compute_slcsa_atom.h
index 6d7cd90c31..ba373a53a8 100644
--- a/src/EXTRA-COMPUTE/compute_slcsa_atom.h
+++ b/src/EXTRA-COMPUTE/compute_slcsa_atom.h
@@ -53,14 +53,12 @@ class ComputeSLCSAAtom : public Compute {
   value_t descriptorval;
   int nmax;
   int ncols;
-  int nevery;
   int ncomps;
   int nclasses;
   const char *database_mean_descriptor_file;
   const char *lda_scalings_file;
   const char *lr_decision_file;
   const char *lr_bias_file;
-  const char *covmat_file;
   const char *maha_file;
   class NeighList *list;
 
diff --git a/src/EXTRA-FIX/fix_ave_correlate_long.cpp b/src/EXTRA-FIX/fix_ave_correlate_long.cpp
index 7fa57af343..fc1760b353 100644
--- a/src/EXTRA-FIX/fix_ave_correlate_long.cpp
+++ b/src/EXTRA-FIX/fix_ave_correlate_long.cpp
@@ -503,7 +503,7 @@ void FixAveCorrelateLong::end_of_step()
     if (overwrite) {
       bigint fileend = platform::ftell(fp);
       if ((fileend > 0) && (platform::ftruncate(fp,fileend)))
-        error->warning(FLERR,"Error while tuncating output: {}", utils::getsyserror());
+        error->warning(FLERR,"Error while truncating output: {}", utils::getsyserror());
     }
   }
 }
@@ -728,7 +728,7 @@ double FixAveCorrelateLong::memory_usage() {
 void FixAveCorrelateLong::write_restart(FILE *fp) {
   if (comm->me == 0) {
     int nsize = 3*npair*numcorrelators*p + 2*npair*numcorrelators
-                + numcorrelators*p + 2*numcorrelators + 6;
+                + numcorrelators*p + 2*numcorrelators + 7;
     int n=0;
     double *list;
     memory->create(list,nsize,"correlator:list");
@@ -736,6 +736,7 @@ void FixAveCorrelateLong::write_restart(FILE *fp) {
     list[n++] = numcorrelators;
     list[n++] = p;
     list[n++] = m;
+    list[n++] = kmax;
     list[n++] = last_accumulated_step;
     for (int i=0; i < npair; i++)
       for (int j=0; j < numcorrelators; j++) {
@@ -771,6 +772,7 @@ void FixAveCorrelateLong::restart(char *buf)
   int numcorrelatorsin = static_cast<int> (list[n++]);
   int pin = static_cast<int>(list[n++]);
   int min = static_cast<int>(list[n++]);
+  kmax = static_cast<int>(list[n++]);
   last_accumulated_step = static_cast<int>(list[n++]);
 
   if ((npairin!=npair) || (numcorrelatorsin!=numcorrelators) || (pin!=(int)p) || (min!=(int)m))
diff --git a/src/EXTRA-FIX/fix_nonaffine_displacement.cpp b/src/EXTRA-FIX/fix_nonaffine_displacement.cpp
index c1de50c41d..a426a8fb55 100644
--- a/src/EXTRA-FIX/fix_nonaffine_displacement.cpp
+++ b/src/EXTRA-FIX/fix_nonaffine_displacement.cpp
@@ -202,7 +202,7 @@ void FixNonaffineDisplacement::init()
     // need an occasional half neighbor list
 
     if (cut_style == RADIUS) {
-      auto req = neighbor->add_request(this, NeighConst::REQ_SIZE | NeighConst::REQ_OCCASIONAL);
+      neighbor->add_request(this, NeighConst::REQ_SIZE | NeighConst::REQ_OCCASIONAL);
     } else {
       auto req = neighbor->add_request(this, NeighConst::REQ_OCCASIONAL);
       if (cut_style == CUSTOM) {
@@ -233,7 +233,7 @@ void FixNonaffineDisplacement::init_list(int /*id*/, NeighList *ptr)
 
 /* ---------------------------------------------------------------------- */
 
-void FixNonaffineDisplacement::setup(int vflag)
+void FixNonaffineDisplacement::setup(int /*vflag*/)
 {
   post_force(0); // Save state if needed before starting the 1st timestep
 }
@@ -285,7 +285,6 @@ void FixNonaffineDisplacement::restart(char *buf)
 
 void FixNonaffineDisplacement::integrate_velocity()
 {
-  int i,n;
   dtv = update->dt;
 
   double **v = atom->v;
@@ -306,7 +305,6 @@ void FixNonaffineDisplacement::integrate_velocity()
 
 void FixNonaffineDisplacement::save_reference_state()
 {
-  int i, n;
   double **x = atom->x;
 
   int *mask = atom->mask;
@@ -354,15 +352,14 @@ void FixNonaffineDisplacement::calculate_D2Min()
 
   int i, j, k, l, ii, jj, inum, jnum, itype, jtype;
   double evol, j2, edev;
-  double r[3], r0[3], rsq, rsq0, radsum, temp[3];
+  double r[3], r0[3], rsq, radsum, temp[3];
   double X_tmp[3][3], Y_tmp[3][3], F_tmp[3][3], E[3][3];
-  double Y_inv[3][3] = {0.0}; // Zero for 2d since not all entries used
+  double Y_inv[3][3] = {{0.0,0.0,0.0},{0.0,0.0,0.0},{0.0,0.0,0.0}}; // Zero for 2d since not all entries used
   int *ilist, *jlist, *numneigh, **firstneigh;
 
   double **x = atom->x;
   double **x0 = array_atom;
   double *radius = atom->radius;
-  tagint *tag = atom->tag;
   int *type = atom->type;
   int *mask = atom->mask;
   int nlocal = atom->nlocal;
diff --git a/src/EXTRA-MOLECULE/angle_cosine_periodic.cpp b/src/EXTRA-MOLECULE/angle_cosine_periodic.cpp
index 34a8e9d8e5..245a7b8d58 100644
--- a/src/EXTRA-MOLECULE/angle_cosine_periodic.cpp
+++ b/src/EXTRA-MOLECULE/angle_cosine_periodic.cpp
@@ -120,7 +120,7 @@ void AngleCosinePeriodic::compute(int eflag, int vflag)
     tn = 1.0;
     tn_1 = 1.0;
     tn_2 = 0.0;
-    un = 1.0;
+    un = (m==1) ? 2.0 : 1.0;
     un_1 = 2.0;
     un_2 = 0.0;
 
diff --git a/src/EXTRA-MOLECULE/dihedral_quadratic.cpp b/src/EXTRA-MOLECULE/dihedral_quadratic.cpp
index f576e6efdd..a7c0dc3eb1 100644
--- a/src/EXTRA-MOLECULE/dihedral_quadratic.cpp
+++ b/src/EXTRA-MOLECULE/dihedral_quadratic.cpp
@@ -338,7 +338,7 @@ void DihedralQuadratic::born_matrix(int nd, int i1, int i2, int i3, int i4,
   double sb1,sb3,rb1,rb3,c0,b1mag2,b1mag,b2mag2;
   double b2mag,b3mag2,b3mag,ctmp,r12c1,c1mag,r12c2;
   double c2mag,sc1,sc2,s12,c;
-  double s1,s2,cx,cy,cz,cmag,dx,phi,si,siinv,sin2;
+  double cx,cy,cz,cmag,dx,phi,si,siinv,sin2;
 
   int **dihedrallist = neighbor->dihedrallist;
   double **x = atom->x;
@@ -405,8 +405,6 @@ void DihedralQuadratic::born_matrix(int nd, int i1, int i2, int i3, int i4,
   if (sc2 < SMALL) sc2 = SMALL;
   sc2 = 1.0/sc2;
 
-  s1 = sc1 * sc1;
-  s2 = sc2 * sc2;
   s12 = sc1 * sc2;
   c = (c0 + c1mag*c2mag) * s12;
 
diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp
index a2c733e7ed..1959f00865 100644
--- a/src/GPU/pppm_gpu.cpp
+++ b/src/GPU/pppm_gpu.cpp
@@ -405,7 +405,8 @@ void PPPMGPU::poisson_ik()
 
   // if requested, compute energy and virial contribution
 
-  double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
+  bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
+  double scaleinv = 1.0 / ngridtotal;
   double s2 = scaleinv*scaleinv;
 
   if (eflag_global || vflag_global) {
diff --git a/src/INTEL/pppm_electrode_intel.cpp b/src/INTEL/pppm_electrode_intel.cpp
index 5cb62dc5d2..4d8a0331b8 100644
--- a/src/INTEL/pppm_electrode_intel.cpp
+++ b/src/INTEL/pppm_electrode_intel.cpp
@@ -420,7 +420,9 @@ void PPPMElectrodeIntel::project_psi(IntelBuffers<flt_t, acc_t> *buffers, double
 #endif
   {
     int *mask = atom->mask;
-    const flt_t scaleinv = 1.0 / (nx_pppm * ny_pppm * nz_pppm);
+
+    const bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
+    const flt_t scaleinv = 1.0 / ngridtotal;
 
     const flt_t lo0 = boxlo[0];
     const flt_t lo1 = boxlo[1];
diff --git a/src/KIM/kim_interactions.cpp b/src/KIM/kim_interactions.cpp
index 1f4f84e648..ce550bf5da 100644
--- a/src/KIM/kim_interactions.cpp
+++ b/src/KIM/kim_interactions.cpp
@@ -70,6 +70,8 @@
 #include "modify.h"
 #include "update.h"
 
+#include "fmt/ranges.h"
+
 #include <cstring>
 #include <vector>
 
diff --git a/src/KIM/kim_param.cpp b/src/KIM/kim_param.cpp
index f72df81989..c50474fe67 100644
--- a/src/KIM/kim_param.cpp
+++ b/src/KIM/kim_param.cpp
@@ -68,6 +68,8 @@
 #include "pair_kim.h"
 #include "variable.h"
 
+#include "fmt/ranges.h"
+
 #include <cstdlib>
 #include <cstring>
 #include <vector>
diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index ba0deedb45..112a2e947a 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -106,6 +106,8 @@ action compute_temp_kokkos.cpp
 action compute_temp_kokkos.h
 action dihedral_charmm_kokkos.cpp dihedral_charmm.cpp
 action dihedral_charmm_kokkos.h dihedral_charmm.h
+action dihedral_charmmfsw_kokkos.cpp dihedral_charmmfsw.cpp
+action dihedral_charmmfsw_kokkos.h dihedral_charmmfsw.h
 action dihedral_class2_kokkos.cpp dihedral_class2.cpp
 action dihedral_class2_kokkos.h dihedral_class2.h
 action dihedral_harmonic_kokkos.cpp dihedral_harmonic.cpp
@@ -311,6 +313,8 @@ action pair_lj_charmm_coul_charmm_kokkos.cpp pair_lj_charmm_coul_charmm.cpp
 action pair_lj_charmm_coul_charmm_kokkos.h pair_lj_charmm_coul_charmm.h
 action pair_lj_charmm_coul_long_kokkos.cpp pair_lj_charmm_coul_long.cpp
 action pair_lj_charmm_coul_long_kokkos.h pair_lj_charmm_coul_long.h
+action pair_lj_charmmfsw_coul_long_kokkos.cpp pair_lj_charmmfsw_coul_long.cpp
+action pair_lj_charmmfsw_coul_long_kokkos.h pair_lj_charmmfsw_coul_long.h
 action pair_lj_class2_coul_cut_kokkos.cpp pair_lj_class2_coul_cut.cpp
 action pair_lj_class2_coul_cut_kokkos.h pair_lj_class2_coul_cut.h
 action pair_lj_class2_coul_long_kokkos.cpp pair_lj_class2_coul_long.cpp
diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp
index c55c1d315b..501b719ad4 100644
--- a/src/KOKKOS/atom_kokkos.cpp
+++ b/src/KOKKOS/atom_kokkos.cpp
@@ -31,7 +31,9 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-AtomKokkos::AtomKokkos(LAMMPS *lmp) : Atom(lmp)
+AtomKokkos::AtomKokkos(LAMMPS *lmp) : Atom(lmp),
+mapBinner(1, 0.0, 1.0), // BinOp1D has no default constructor; placeholder values, reassigned in map_set()
+mapSorter(d_tag_sorted, 0, 1, mapBinner, true)
 {
   avecKK = nullptr;
 
@@ -300,7 +302,7 @@ void AtomKokkos::grow(unsigned int mask)
 
 int AtomKokkos::add_custom(const char *name, int flag, int cols)
 {
-  int index;
+  int index = -1;
 
   if (flag == 0 && cols == 0) {
     index = nivector;
diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h
index 6a3036375d..e6269b5527 100644
--- a/src/KOKKOS/atom_kokkos.h
+++ b/src/KOKKOS/atom_kokkos.h
@@ -103,7 +103,8 @@ class AtomKokkos : public Atom {
 
   using MapKeyViewType = decltype(d_tag_sorted);
   using BinOpMap = Kokkos::BinOp1D<MapKeyViewType>;
-  Kokkos::BinSort<MapKeyViewType, BinOpMap> Sorter;
+  BinOpMap mapBinner;
+  Kokkos::BinSort<MapKeyViewType, BinOpMap> mapSorter;
 
   class AtomVecKokkos* avecKK;
 
diff --git a/src/KOKKOS/atom_map_kokkos.cpp b/src/KOKKOS/atom_map_kokkos.cpp
index 06516e4142..4f46c33dbe 100644
--- a/src/KOKKOS/atom_map_kokkos.cpp
+++ b/src/KOKKOS/atom_map_kokkos.cpp
@@ -146,7 +146,7 @@ void AtomKokkos::map_set()
   int nmax = atom->nmax;
 
   int realloc_flag = 0;
-  if (d_tag_sorted.extent(0) < nmax) {
+  if (!d_tag_sorted.data() || (int)d_tag_sorted.extent(0) < nmax) {
     MemKK::realloc_kokkos(d_tag_sorted,"atom:tag_sorted",nmax);
     MemKK::realloc_kokkos(d_i_sorted,"atom:i_sorted",nmax);
     realloc_flag = 1;
@@ -179,25 +179,25 @@ void AtomKokkos::map_set()
   using MapKeyViewType = decltype(d_tag_sorted);
   using BinOpMap = Kokkos::BinOp1D<MapKeyViewType>;
 
-  auto binner = BinOpMap(nall, min, max);
+  mapBinner = BinOpMap(nall, min, max);
 
-  if (!Sorter.bin_offsets.data() || realloc_flag) {
-    Sorter = Kokkos::BinSort<MapKeyViewType, BinOpMap>(d_tag_sorted, 0, nall, binner, true);
-    MemKK::realloc_kokkos(Sorter.bin_count_atomic,"Kokkos::SortImpl::BinSortFunctor::bin_count",nmax+1);
-    Kokkos::deep_copy(Sorter.bin_count_atomic,0);
-    Sorter.bin_count_const = Sorter.bin_count_atomic;
-    MemKK::realloc_kokkos(Sorter.bin_offsets,"Kokkos::SortImpl::BinSortFunctor::bin_offsets",nmax+1);
-    MemKK::realloc_kokkos(Sorter.sort_order,"Kokkos::SortImpl::BinSortFunctor::sort_order",nmax);
+  if (realloc_flag) {
+    mapSorter = Kokkos::BinSort<MapKeyViewType, BinOpMap>(d_tag_sorted, 0, nall, mapBinner, true);
+    MemKK::realloc_kokkos(mapSorter.bin_count_atomic,"Kokkos::SortImpl::BinSortFunctor::bin_count",nmax+1);
+    Kokkos::deep_copy(mapSorter.bin_count_atomic,0);
+    mapSorter.bin_count_const = mapSorter.bin_count_atomic;
+    MemKK::realloc_kokkos(mapSorter.bin_offsets,"Kokkos::SortImpl::BinSortFunctor::bin_offsets",nmax+1);
+    MemKK::realloc_kokkos(mapSorter.sort_order,"Kokkos::SortImpl::BinSortFunctor::sort_order",nmax);
   } else {
-    Kokkos::deep_copy(Sorter.bin_count_atomic,0);
-    Sorter.bin_op = binner;
-    Sorter.range_begin = 0;
-    Sorter.range_end = nall;
+    Kokkos::deep_copy(mapSorter.bin_count_atomic,0);
+    mapSorter.bin_op = mapBinner;
+    mapSorter.range_begin = 0;
+    mapSorter.range_end = nall;
   }
 
-  Sorter.create_permute_vector(LMPDeviceType());
-  Sorter.sort(LMPDeviceType(), d_tag_sorted, 0, nall);
-  Sorter.sort(LMPDeviceType(), d_i_sorted, 0, nall);
+  mapSorter.create_permute_vector(LMPDeviceType());
+  mapSorter.sort(LMPDeviceType(), d_tag_sorted, 0, nall);
+  mapSorter.sort(LMPDeviceType(), d_i_sorted, 0, nall);
 
   auto d_map_array = k_map_array.d_view;
   auto d_map_hash = k_map_hash.d_view;
@@ -273,6 +273,7 @@ void AtomKokkos::map_set()
     error->one(FLERR,"Failed to insert into Kokkos hash atom map");
 
   k_sametag.modify_device();
+  k_sametag.sync_host();
 
   if (map_style == MAP_ARRAY)
     k_map_array.modify_device();
diff --git a/src/KOKKOS/atom_vec_angle_kokkos.cpp b/src/KOKKOS/atom_vec_angle_kokkos.cpp
index dd6be164c0..418c2d629d 100644
--- a/src/KOKKOS/atom_vec_angle_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_angle_kokkos.cpp
@@ -680,7 +680,6 @@ struct AtomVecAngleKokkos_PackExchangeFunctor {
       const typename AT::tdual_xfloat_2d buf,
       typename AT::tdual_int_1d sendlist,
       typename AT::tdual_int_1d copylist):
-    _size_exchange(atom->avecKK->size_exchange),
     _x(atom->k_x.view<DeviceType>()),
     _v(atom->k_v.view<DeviceType>()),
     _tag(atom->k_tag.view<DeviceType>()),
@@ -716,7 +715,8 @@ struct AtomVecAngleKokkos_PackExchangeFunctor {
     _angle_atom2w(atom->k_angle_atom2.view<DeviceType>()),
     _angle_atom3w(atom->k_angle_atom3.view<DeviceType>()),
     _sendlist(sendlist.template view<DeviceType>()),
-    _copylist(copylist.template view<DeviceType>()) {
+    _copylist(copylist.template view<DeviceType>()),
+    _size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
                              buf.template view<DeviceType>().extent(1))/_size_exchange;
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
@@ -858,7 +858,6 @@ struct AtomVecAngleKokkos_UnpackExchangeFunctor {
       const typename AT::tdual_xfloat_2d buf,
       typename AT::tdual_int_1d nlocal,
       int dim, X_FLOAT lo, X_FLOAT hi):
-    _size_exchange(atom->avecKK->size_exchange),
     _x(atom->k_x.view<DeviceType>()),
     _v(atom->k_v.view<DeviceType>()),
     _tag(atom->k_tag.view<DeviceType>()),
@@ -876,8 +875,8 @@ struct AtomVecAngleKokkos_UnpackExchangeFunctor {
     _angle_atom1(atom->k_angle_atom1.view<DeviceType>()),
     _angle_atom2(atom->k_angle_atom2.view<DeviceType>()),
     _angle_atom3(atom->k_angle_atom3.view<DeviceType>()),
-    _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
-    _lo(lo),_hi(hi) {
+    _nlocal(nlocal.template view<DeviceType>()),
+    _dim(dim),_lo(lo),_hi(hi),_size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
                              buf.template view<DeviceType>().extent(1))/_size_exchange;
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
@@ -927,7 +926,7 @@ struct AtomVecAngleKokkos_UnpackExchangeFunctor {
 
 int AtomVecAngleKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv, int nlocal,
                                                int dim, X_FLOAT lo, X_FLOAT hi, ExecutionSpace space,
-                                               DAT::tdual_int_1d &k_indices)
+                                               DAT::tdual_int_1d &/*k_indices*/)
 {
   while (nlocal + nrecv/size_exchange >= nmax) grow(0);
 
diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
index 1ea8377a68..973ad2f7f2 100644
--- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
@@ -294,7 +294,6 @@ struct AtomVecAtomicKokkos_PackExchangeFunctor {
     const typename AT::tdual_xfloat_2d buf,
     typename AT::tdual_int_1d sendlist,
     typename AT::tdual_int_1d copylist):
-    _size_exchange(atom->avecKK->size_exchange),
     _x(atom->k_x.view<DeviceType>()),
     _v(atom->k_v.view<DeviceType>()),
     _tag(atom->k_tag.view<DeviceType>()),
@@ -308,7 +307,8 @@ struct AtomVecAtomicKokkos_PackExchangeFunctor {
     _maskw(atom->k_mask.view<DeviceType>()),
     _imagew(atom->k_image.view<DeviceType>()),
     _sendlist(sendlist.template view<DeviceType>()),
-    _copylist(copylist.template view<DeviceType>()) {
+    _copylist(copylist.template view<DeviceType>()),
+    _size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*buf.template view<DeviceType>().extent(1))/_size_exchange;
 
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
@@ -392,16 +392,15 @@ struct AtomVecAtomicKokkos_UnpackExchangeFunctor {
     typename AT::tdual_int_1d nlocal,
     typename AT::tdual_int_1d indices,
     int dim, X_FLOAT lo, X_FLOAT hi):
-      _size_exchange(atom->avecKK->size_exchange),
       _x(atom->k_x.view<DeviceType>()),
       _v(atom->k_v.view<DeviceType>()),
       _tag(atom->k_tag.view<DeviceType>()),
       _type(atom->k_type.view<DeviceType>()),
       _mask(atom->k_mask.view<DeviceType>()),
       _image(atom->k_image.view<DeviceType>()),
-      _indices(indices.template view<DeviceType>()),
-      _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
-      _lo(lo),_hi(hi) {
+      _nlocal(nlocal.template view<DeviceType>()),
+      _indices(indices.template view<DeviceType>()),_dim(dim),
+      _lo(lo),_hi(hi),_size_exchange(atom->avecKK->size_exchange) {
         const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
                                  buf.template view<DeviceType>().extent(1))/_size_exchange;
         buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
diff --git a/src/KOKKOS/atom_vec_bond_kokkos.cpp b/src/KOKKOS/atom_vec_bond_kokkos.cpp
index c45bdedf38..a4fd9ca1b5 100644
--- a/src/KOKKOS/atom_vec_bond_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp
@@ -352,7 +352,6 @@ struct AtomVecBondKokkos_PackExchangeFunctor {
     const typename AT::tdual_xfloat_2d buf,
     typename AT::tdual_int_1d sendlist,
     typename AT::tdual_int_1d copylist):
-      _size_exchange(atom->avecKK->size_exchange),
       _x(atom->k_x.view<DeviceType>()),
       _v(atom->k_v.view<DeviceType>()),
       _tag(atom->k_tag.view<DeviceType>()),
@@ -378,7 +377,8 @@ struct AtomVecBondKokkos_PackExchangeFunctor {
       _bond_typew(atom->k_bond_type.view<DeviceType>()),
       _bond_atomw(atom->k_bond_atom.view<DeviceType>()),
       _sendlist(sendlist.template view<DeviceType>()),
-      _copylist(copylist.template view<DeviceType>()) {
+      _copylist(copylist.template view<DeviceType>()),
+      _size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
                              buf.template view<DeviceType>().extent(1))/_size_exchange;
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
@@ -503,7 +503,6 @@ struct AtomVecBondKokkos_UnpackExchangeFunctor {
     typename AT::tdual_int_1d nlocal,
     typename AT::tdual_int_1d indices,
     int dim, X_FLOAT lo, X_FLOAT hi):
-      _size_exchange(atom->avecKK->size_exchange),
       _x(atom->k_x.view<DeviceType>()),
       _v(atom->k_v.view<DeviceType>()),
       _tag(atom->k_tag.view<DeviceType>()),
@@ -516,9 +515,9 @@ struct AtomVecBondKokkos_UnpackExchangeFunctor {
       _num_bond(atom->k_num_bond.view<DeviceType>()),
       _bond_type(atom->k_bond_type.view<DeviceType>()),
       _bond_atom(atom->k_bond_atom.view<DeviceType>()),
+      _nlocal(nlocal.template view<DeviceType>()),
       _indices(indices.template view<DeviceType>()),
-      _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
-      _lo(lo),_hi(hi) {
+      _dim(dim),_lo(lo),_hi(hi),_size_exchange(atom->avecKK->size_exchange) {
         const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
                                  buf.template view<DeviceType>().extent(1))/_size_exchange;
         buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
diff --git a/src/KOKKOS/atom_vec_charge_kokkos.cpp b/src/KOKKOS/atom_vec_charge_kokkos.cpp
index 22fc63ff91..4fa814f1ac 100644
--- a/src/KOKKOS/atom_vec_charge_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp
@@ -366,7 +366,6 @@ struct AtomVecChargeKokkos_PackExchangeFunctor {
       const typename AT::tdual_xfloat_2d buf,
       typename AT::tdual_int_1d sendlist,
       typename AT::tdual_int_1d copylist):
-    _size_exchange(atom->avecKK->size_exchange),
     _x(atom->k_x.view<DeviceType>()),
     _v(atom->k_v.view<DeviceType>()),
     _tag(atom->k_tag.view<DeviceType>()),
@@ -382,7 +381,8 @@ struct AtomVecChargeKokkos_PackExchangeFunctor {
     _imagew(atom->k_image.view<DeviceType>()),
     _qw(atom->k_q.view<DeviceType>()),
     _sendlist(sendlist.template view<DeviceType>()),
-    _copylist(copylist.template view<DeviceType>()) {
+    _copylist(copylist.template view<DeviceType>()),
+    _size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
                              buf.template view<DeviceType>().extent(1))/_size_exchange;
 
@@ -474,17 +474,16 @@ struct AtomVecChargeKokkos_UnpackExchangeFunctor {
     typename AT::tdual_int_1d nlocal,
     typename AT::tdual_int_1d indices,
     int dim, X_FLOAT lo, X_FLOAT hi):
-      _size_exchange(atom->avecKK->size_exchange),
       _x(atom->k_x.view<DeviceType>()),
       _v(atom->k_v.view<DeviceType>()),
       _tag(atom->k_tag.view<DeviceType>()),
       _type(atom->k_type.view<DeviceType>()),
       _mask(atom->k_mask.view<DeviceType>()),
       _image(atom->k_image.view<DeviceType>()),
-      _indices(indices.template view<DeviceType>()),
       _q(atom->k_q.view<DeviceType>()),
-      _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
-      _lo(lo),_hi(hi) {
+      _nlocal(nlocal.template view<DeviceType>()),
+      _indices(indices.template view<DeviceType>()),_dim(dim),
+      _lo(lo),_hi(hi),_size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*buf.template view<DeviceType>().extent(1))/_size_exchange;
 
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
diff --git a/src/KOKKOS/atom_vec_dipole_kokkos.cpp b/src/KOKKOS/atom_vec_dipole_kokkos.cpp
index ad06570cdc..ecc0f3b497 100644
--- a/src/KOKKOS/atom_vec_dipole_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dipole_kokkos.cpp
@@ -398,7 +398,6 @@ struct AtomVecDipoleKokkos_PackExchangeFunctor {
       const typename AT::tdual_xfloat_2d buf,
       typename AT::tdual_int_1d sendlist,
       typename AT::tdual_int_1d copylist):
-    _size_exchange(atom->avecKK->size_exchange),
     _x(atom->k_x.view<DeviceType>()),
     _v(atom->k_v.view<DeviceType>()),
     _tag(atom->k_tag.view<DeviceType>()),
@@ -416,7 +415,8 @@ struct AtomVecDipoleKokkos_PackExchangeFunctor {
     _qw(atom->k_q.view<DeviceType>()),
     _muw(atom->k_mu.view<DeviceType>()),
     _sendlist(sendlist.template view<DeviceType>()),
-    _copylist(copylist.template view<DeviceType>()) {
+    _copylist(copylist.template view<DeviceType>()),
+    _size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
                              buf.template view<DeviceType>().extent(1))/_size_exchange;
 
@@ -515,7 +515,6 @@ struct AtomVecDipoleKokkos_UnpackExchangeFunctor {
       const typename AT::tdual_xfloat_2d buf,
       typename AT::tdual_int_1d nlocal,
       int dim, X_FLOAT lo, X_FLOAT hi):
-      _size_exchange(atom->avecKK->size_exchange),
       _x(atom->k_x.view<DeviceType>()),
       _v(atom->k_v.view<DeviceType>()),
       _tag(atom->k_tag.view<DeviceType>()),
@@ -524,8 +523,8 @@ struct AtomVecDipoleKokkos_UnpackExchangeFunctor {
       _image(atom->k_image.view<DeviceType>()),
       _q(atom->k_q.view<DeviceType>()),
       _mu(atom->k_mu.view<DeviceType>()),
-      _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
-      _lo(lo),_hi(hi) {
+      _nlocal(nlocal.template view<DeviceType>()),
+      _dim(dim),_lo(lo),_hi(hi),_size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*buf.template view<DeviceType>().extent(1))/_size_exchange;
 
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
@@ -557,8 +556,8 @@ struct AtomVecDipoleKokkos_UnpackExchangeFunctor {
 
 /* ---------------------------------------------------------------------- */
 int AtomVecDipoleKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv, int nlocal,
-                                               int dim, X_FLOAT lo, X_FLOAT hi, ExecutionSpace space,
-                                               DAT::tdual_int_1d &k_indices)
+                                                int dim, X_FLOAT lo, X_FLOAT hi, ExecutionSpace space,
+                                                DAT::tdual_int_1d &/*k_indices*/)
 {
   if (space == Host) {
     k_count.h_view(0) = nlocal;
diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
index c3430b9f6e..70aedcc931 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -746,7 +746,6 @@ struct AtomVecDPDKokkos_PackExchangeFunctor {
       const typename AT::tdual_xfloat_2d buf,
       typename AT::tdual_int_1d sendlist,
       typename AT::tdual_int_1d copylist):
-                _size_exchange(atom->avecKK->size_exchange),
                 _x(atom->k_x.view<DeviceType>()),
                 _v(atom->k_v.view<DeviceType>()),
                 _tag(atom->k_tag.view<DeviceType>()),
@@ -772,7 +771,8 @@ struct AtomVecDPDKokkos_PackExchangeFunctor {
                 _uCGw(atom->k_uCG.view<DeviceType>()),
                 _uCGneww(atom->k_uCGnew.view<DeviceType>()),
                 _sendlist(sendlist.template view<DeviceType>()),
-                _copylist(copylist.template view<DeviceType>()) {
+                _copylist(copylist.template view<DeviceType>()),
+                _size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*buf.template view<DeviceType>().extent(1))/_size_exchange;
 
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
@@ -875,15 +875,14 @@ struct AtomVecDPDKokkos_UnpackExchangeFunctor {
       const typename AT::tdual_xfloat_2d buf,
       typename AT::tdual_int_1d nlocal,
       int dim, X_FLOAT lo, X_FLOAT hi):
-                _size_exchange(atom->avecKK->size_exchange),
                 _x(atom->k_x.view<DeviceType>()),
                 _v(atom->k_v.view<DeviceType>()),
                 _tag(atom->k_tag.view<DeviceType>()),
                 _type(atom->k_type.view<DeviceType>()),
                 _mask(atom->k_mask.view<DeviceType>()),
                 _image(atom->k_image.view<DeviceType>()),
-                _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
-                _lo(lo),_hi(hi) {
+                _nlocal(nlocal.template view<DeviceType>()),
+                _dim(dim),_lo(lo),_hi(hi),_size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*buf.template view<DeviceType>().extent(1))/_size_exchange;
 
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
@@ -917,7 +916,7 @@ struct AtomVecDPDKokkos_UnpackExchangeFunctor {
 /* ---------------------------------------------------------------------- */
 int AtomVecDPDKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv, int nlocal,
                                              int dim, X_FLOAT lo, X_FLOAT hi, ExecutionSpace space,
-                                             DAT::tdual_int_1d &k_indices)
+                                             DAT::tdual_int_1d &/*k_indices*/)
 {
   while (nlocal + nrecv/size_exchange >= nmax) grow(0);
 
diff --git a/src/KOKKOS/atom_vec_full_kokkos.cpp b/src/KOKKOS/atom_vec_full_kokkos.cpp
index 829ebc75e6..732078a627 100644
--- a/src/KOKKOS/atom_vec_full_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_full_kokkos.cpp
@@ -501,7 +501,6 @@ struct AtomVecFullKokkos_PackExchangeFunctor {
       const typename AT::tdual_xfloat_2d buf,
       typename AT::tdual_int_1d sendlist,
       typename AT::tdual_int_1d copylist):
-    _size_exchange(atom->avecKK->size_exchange),
     _x(atom->k_x.view<DeviceType>()),
     _v(atom->k_v.view<DeviceType>()),
     _tag(atom->k_tag.view<DeviceType>()),
@@ -563,7 +562,8 @@ struct AtomVecFullKokkos_PackExchangeFunctor {
     _improper_atom3w(atom->k_improper_atom3.view<DeviceType>()),
     _improper_atom4w(atom->k_improper_atom4.view<DeviceType>()),
     _sendlist(sendlist.template view<DeviceType>()),
-    _copylist(copylist.template view<DeviceType>()) {
+    _copylist(copylist.template view<DeviceType>()),
+    _size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
                              buf.template view<DeviceType>().extent(1))/_size_exchange;
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
@@ -755,14 +755,12 @@ struct AtomVecFullKokkos_UnpackExchangeFunctor {
     typename AT::tdual_int_1d nlocal,
     typename AT::tdual_int_1d indices,
     int dim, X_FLOAT lo, X_FLOAT hi):
-      _size_exchange(atom->avecKK->size_exchange),
       _x(atom->k_x.view<DeviceType>()),
       _v(atom->k_v.view<DeviceType>()),
       _tag(atom->k_tag.view<DeviceType>()),
       _type(atom->k_type.view<DeviceType>()),
       _mask(atom->k_mask.view<DeviceType>()),
       _image(atom->k_image.view<DeviceType>()),
-      _indices(indices.template view<DeviceType>()),
       _q(atom->k_q.view<DeviceType>()),
       _molecule(atom->k_molecule.view<DeviceType>()),
       _nspecial(atom->k_nspecial.view<DeviceType>()),
@@ -787,9 +785,9 @@ struct AtomVecFullKokkos_UnpackExchangeFunctor {
       _improper_atom2(atom->k_improper_atom2.view<DeviceType>()),
       _improper_atom3(atom->k_improper_atom3.view<DeviceType>()),
       _improper_atom4(atom->k_improper_atom4.view<DeviceType>()),
-      _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
-      _lo(lo),_hi(hi) {
-
+      _nlocal(nlocal.template view<DeviceType>()),
+      _indices(indices.template view<DeviceType>()),
+      _dim(dim),_lo(lo),_hi(hi),_size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
                              buf.template view<DeviceType>().extent(1))/_size_exchange;
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
diff --git a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
index 4e01ab5794..08bcaaef74 100644
--- a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
@@ -66,7 +66,7 @@ void AtomVecHybridKokkos::sort_kokkos(Kokkos::BinSort<KeyViewType, BinOp> &Sorte
 int AtomVecHybridKokkos::pack_comm_kokkos(const int &/*n*/, const DAT::tdual_int_2d &/*k_sendlist*/,
                                           const int & /*iswap*/,
                                           const DAT::tdual_xfloat_2d &/*buf*/,
-                                          const int &/*pbc_flag*/, const int pbc[])
+                                          const int &/*pbc_flag*/, const int /*pbc*/[])
 {
   error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
   return 0;
@@ -80,7 +80,7 @@ void AtomVecHybridKokkos::unpack_comm_kokkos(const int &/*n*/, const int &/*nfir
 
 int AtomVecHybridKokkos::pack_comm_self(const int &/*n*/, const DAT::tdual_int_2d &/*list*/,
                                         const int & /*iswap*/, const int /*nfirst*/,
-                                        const int &/*pbc_flag*/, const int pbc[])
+                                        const int &/*pbc_flag*/, const int /*pbc*/[])
 {
   error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
   return 0;
@@ -113,7 +113,7 @@ int AtomVecHybridKokkos::pack_exchange_kokkos(const int &/*nsend*/,DAT::tdual_xf
 int AtomVecHybridKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d & /*k_buf*/, int /*nrecv*/,
                                                 int /*nlocal*/, int /*dim*/, X_FLOAT /*lo*/,
                                                 X_FLOAT /*hi*/, ExecutionSpace /*space*/,
-                                                DAT::tdual_int_1d &k_indices)
+                                                DAT::tdual_int_1d &/*k_indices*/)
 {
   error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
   return 0;
diff --git a/src/KOKKOS/atom_vec_molecular_kokkos.cpp b/src/KOKKOS/atom_vec_molecular_kokkos.cpp
index 471dd0ad58..ec98ff9239 100644
--- a/src/KOKKOS/atom_vec_molecular_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_molecular_kokkos.cpp
@@ -762,7 +762,6 @@ struct AtomVecMolecularKokkos_PackExchangeFunctor {
     const typename AT::tdual_xfloat_2d buf,
     typename AT::tdual_int_1d sendlist,
     typename AT::tdual_int_1d copylist):
-      _size_exchange(atom->avecKK->size_exchange),
       _x(atom->k_x.view<DeviceType>()),
       _v(atom->k_v.view<DeviceType>()),
       _tag(atom->k_tag.view<DeviceType>()),
@@ -822,7 +821,8 @@ struct AtomVecMolecularKokkos_PackExchangeFunctor {
       _improper_atom3w(atom->k_improper_atom3.view<DeviceType>()),
       _improper_atom4w(atom->k_improper_atom4.view<DeviceType>()),
       _sendlist(sendlist.template view<DeviceType>()),
-      _copylist(copylist.template view<DeviceType>()) {
+      _copylist(copylist.template view<DeviceType>()),
+      _size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
                              buf.template view<DeviceType>().extent(1))/_size_exchange;
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
@@ -1010,7 +1010,6 @@ struct AtomVecMolecularKokkos_UnpackExchangeFunctor {
     typename AT::tdual_int_1d nlocal,
     typename AT::tdual_int_1d indices,
     int dim, X_FLOAT lo, X_FLOAT hi):
-      _size_exchange(atom->avecKK->size_exchange),
       _x(atom->k_x.view<DeviceType>()),
       _v(atom->k_v.view<DeviceType>()),
       _tag(atom->k_tag.view<DeviceType>()),
@@ -1040,9 +1039,9 @@ struct AtomVecMolecularKokkos_UnpackExchangeFunctor {
       _improper_atom2(atom->k_improper_atom2.view<DeviceType>()),
       _improper_atom3(atom->k_improper_atom3.view<DeviceType>()),
       _improper_atom4(atom->k_improper_atom4.view<DeviceType>()),
+      _nlocal(nlocal.template view<DeviceType>()),
       _indices(indices.template view<DeviceType>()),
-      _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
-      _lo(lo),_hi(hi) {
+      _dim(dim),_lo(lo),_hi(hi),_size_exchange(atom->avecKK->size_exchange) {
         const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
                                  buf.template view<DeviceType>().extent(1))/_size_exchange;
         buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
diff --git a/src/KOKKOS/atom_vec_sphere_kokkos.cpp b/src/KOKKOS/atom_vec_sphere_kokkos.cpp
index 5a1c2beee3..3dfb5143cd 100644
--- a/src/KOKKOS/atom_vec_sphere_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_sphere_kokkos.cpp
@@ -1448,7 +1448,6 @@ struct AtomVecSphereKokkos_PackExchangeFunctor {
     const typename AT::tdual_xfloat_2d buf,
     typename AT::tdual_int_1d sendlist,
     typename AT::tdual_int_1d copylist):
-    _size_exchange(atom->avecKK->size_exchange),
     _x(atom->k_x.view<DeviceType>()),
     _v(atom->k_v.view<DeviceType>()),
     _tag(atom->k_tag.view<DeviceType>()),
@@ -1468,7 +1467,8 @@ struct AtomVecSphereKokkos_PackExchangeFunctor {
     _rmassw(atom->k_rmass.view<DeviceType>()),
     _omegaw(atom->k_omega.view<DeviceType>()),
     _sendlist(sendlist.template view<DeviceType>()),
-    _copylist(copylist.template view<DeviceType>()) {
+    _copylist(copylist.template view<DeviceType>()),
+    _size_exchange(atom->avecKK->size_exchange) {
     const int maxsend = (buf.template view<DeviceType>().extent(0)*buf.template view<DeviceType>().extent(1))/_size_exchange;
 
     _buf = typename AT::t_xfloat_2d_um(buf.template view<DeviceType>().data(),maxsend,_size_exchange);
@@ -1572,7 +1572,6 @@ struct AtomVecSphereKokkos_UnpackExchangeFunctor {
     typename AT::tdual_int_1d nlocal,
     typename AT::tdual_int_1d indices,
     int dim, X_FLOAT lo, X_FLOAT hi):
-      _size_exchange(atom->avecKK->size_exchange),
       _x(atom->k_x.view<DeviceType>()),
       _v(atom->k_v.view<DeviceType>()),
       _tag(atom->k_tag.view<DeviceType>()),
@@ -1584,9 +1583,7 @@ struct AtomVecSphereKokkos_UnpackExchangeFunctor {
       _omega(atom->k_omega.view<DeviceType>()),
       _nlocal(nlocal.template view<DeviceType>()),
       _indices(indices.template view<DeviceType>()),
-      _dim(dim),
-      _lo(lo),_hi(hi)
-  {
+      _dim(dim),_lo(lo),_hi(hi),_size_exchange(atom->avecKK->size_exchange) {
     const size_t size_exchange = 16;
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*buf.template view<DeviceType>().extent(1))/size_exchange;
 
diff --git a/src/KOKKOS/atom_vec_spin_kokkos.cpp b/src/KOKKOS/atom_vec_spin_kokkos.cpp
index d2dd3a05ab..72d38a731e 100644
--- a/src/KOKKOS/atom_vec_spin_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_spin_kokkos.cpp
@@ -410,7 +410,6 @@ struct AtomVecSpinKokkos_PackExchangeFunctor {
       const typename AT::tdual_xfloat_2d buf,
       typename AT::tdual_int_1d sendlist,
       typename AT::tdual_int_1d copylist):
-    _size_exchange(atom->avecKK->size_exchange),
     _x(atom->k_x.view<DeviceType>()),
     _v(atom->k_v.view<DeviceType>()),
     _tag(atom->k_tag.view<DeviceType>()),
@@ -426,7 +425,8 @@ struct AtomVecSpinKokkos_PackExchangeFunctor {
     _imagew(atom->k_image.view<DeviceType>()),
     _spw(atom->k_sp.view<DeviceType>()),
     _sendlist(sendlist.template view<DeviceType>()),
-    _copylist(copylist.template view<DeviceType>()) {
+    _copylist(copylist.template view<DeviceType>()),
+    _size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
                              buf.template view<DeviceType>().extent(1))/_size_exchange;
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
@@ -521,7 +521,6 @@ struct AtomVecSpinKokkos_UnpackExchangeFunctor {
       const typename AT::tdual_xfloat_2d buf,
       typename AT::tdual_int_1d nlocal,
       int dim, X_FLOAT lo, X_FLOAT hi):
-      _size_exchange(atom->avecKK->size_exchange),
       _x(atom->k_x.view<DeviceType>()),
       _v(atom->k_v.view<DeviceType>()),
       _tag(atom->k_tag.view<DeviceType>()),
@@ -529,8 +528,8 @@ struct AtomVecSpinKokkos_UnpackExchangeFunctor {
       _mask(atom->k_mask.view<DeviceType>()),
       _image(atom->k_image.view<DeviceType>()),
       _sp(atom->k_sp.view<DeviceType>()),
-      _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
-      _lo(lo),_hi(hi) {
+      _nlocal(nlocal.template view<DeviceType>()),
+      _dim(dim),_lo(lo),_hi(hi),_size_exchange(atom->avecKK->size_exchange) {
     const int maxsendlist = (buf.template view<DeviceType>().extent(0)*buf.template view<DeviceType>().extent(1))/_size_exchange;
 
     buffer_view<DeviceType>(_buf,buf,maxsendlist,_size_exchange);
@@ -563,7 +562,7 @@ struct AtomVecSpinKokkos_UnpackExchangeFunctor {
 
 int AtomVecSpinKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv, int nlocal,
                                               int dim, X_FLOAT lo, X_FLOAT hi, ExecutionSpace space,
-                                              DAT::tdual_int_1d &k_indices)
+                                              DAT::tdual_int_1d &/*k_indices*/)
 {
   while (nlocal + nrecv/size_exchange >= nmax) grow(0);
 
@@ -592,7 +591,7 @@ int AtomVecSpinKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int n
    include f b/c this is invoked from within SPIN pair styles
 ------------------------------------------------------------------------- */
 
-void AtomVecSpinKokkos::force_clear(int n, size_t nbytes)
+void AtomVecSpinKokkos::force_clear(int /*n*/, size_t nbytes)
 {
   int nzero = (double)nbytes/sizeof(double);
 
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index b586dca7a5..2f1818e47f 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -864,7 +864,7 @@ void CommKokkos::exchange_device()
         if (nrecv) {
 
           if (atom->nextra_grow) {
-            if (k_indices.extent(0) < nrecv/data_size)
+            if ((int) k_indices.extent(0) < nrecv/data_size)
               MemoryKokkos::realloc_kokkos(k_indices,"comm:indices",nrecv/data_size);
           } else if (k_indices.h_view.data())
            k_indices = DAT::tdual_int_1d();
@@ -931,6 +931,7 @@ void CommKokkos::exchange_device()
             if (nextrarecv) {
               kkbase->unpack_exchange_kokkos(
                 k_buf_recv,k_indices,nrecv/data_size,
+                nrecv1/data_size,nextrarecv1,
                 ExecutionSpaceFromDevice<DeviceType>::space);
               DeviceType().fence();
             }
diff --git a/src/KOKKOS/compute_reaxff_atom_kokkos.cpp b/src/KOKKOS/compute_reaxff_atom_kokkos.cpp
index 8dbcb9441e..3f6c9242d4 100644
--- a/src/KOKKOS/compute_reaxff_atom_kokkos.cpp
+++ b/src/KOKKOS/compute_reaxff_atom_kokkos.cpp
@@ -87,7 +87,7 @@ void ComputeReaxFFAtomKokkos<DeviceType>::compute_bonds()
 
   nbuf = ((store_bonds ? maxnumbonds*2 : 0) + 3)*nlocal;
 
-  if (!buf || k_buf.extent(0) < nbuf) {
+  if (!buf || ((int)k_buf.extent(0) < nbuf)) {
     memoryKK->destroy_kokkos(k_buf, buf);
     memoryKK->create_kokkos(k_buf, buf, nbuf, "reaxff/atom:buf");
   }
diff --git a/src/KOKKOS/dihedral_charmmfsw_kokkos.cpp b/src/KOKKOS/dihedral_charmmfsw_kokkos.cpp
new file mode 100644
index 0000000000..1caea90a74
--- /dev/null
+++ b/src/KOKKOS/dihedral_charmmfsw_kokkos.cpp
@@ -0,0 +1,815 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+
+   Contributing author: Mitch Murphy (alphataubio)
+
+   Based on serial dihedral_charmmfsw.cpp lj-fsw sections (force-switched)
+   provided by Robert Meissner and Lucio Colombi Ciacchi of Bremen
+   University, Germany, with additional assistance from
+   Robert A. Latour, Clemson University.
+
+------------------------------------------------------------------------- */
+
+#include "dihedral_charmmfsw_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "math_const.h"
+#include "memory_kokkos.h"
+#include "neighbor_kokkos.h"
+#include "pair.h"
+
+#include <cmath>
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define TOLERANCE 0.05
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+DihedralCharmmfswKokkos<DeviceType>::DihedralCharmmfswKokkos(LAMMPS *lmp) : DihedralCharmmfsw(lmp)
+{
+  centroidstressflag = CENTROID_NOTAVAIL;
+  // Kokkos-typed handles to the atom and neighbor objects, plus the space this instance runs in
+  neighborKK = (NeighborKokkos *) neighbor;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+  datamask_read = X_MASK | F_MASK | Q_MASK | ENERGY_MASK | VIRIAL_MASK | TYPE_MASK;
+  // scalar flag shared between host and device; the compute kernel sets it, compute() reads it back
+  k_warning_flag = Kokkos::DualView<int,DeviceType>("Dihedral:warning_flag");
+  h_warning_flag = k_warning_flag.h_view;
+  d_warning_flag = k_warning_flag.template view<DeviceType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+DihedralCharmmfswKokkos<DeviceType>::~DihedralCharmmfswKokkos()
+{
+  // while copymode is set (compute() sets it around its kernel launches) leave the arrays untouched
+  if (copymode) return;
+  memoryKK->destroy_kokkos(k_eatom,eatom);
+  memoryKK->destroy_kokkos(k_vatom,vatom);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void DihedralCharmmfswKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  // launch the dihedral kernel, then fold its energy/virial tallies into this style and force->pair
+  eflag = eflag_in;
+  vflag = vflag_in;
+  if (lmp->kokkos->neighflag == FULL)
+    error->all(FLERR,"Dihedral_style charmmfsw/kk requires half neighbor list");
+
+  ev_init(eflag,vflag,0);
+
+  // ensure pair->ev_tally() will use 1-4 virial contribution
+
+  if (weightflag && vflag_global == VIRIAL_FDOTR)
+    force->pair->vflag_either = force->pair->vflag_global = 1;
+
+  // (re)create the per-atom tally arrays on every call; the *_pair views hold the 1-4 pair share
+
+  if (eflag_atom) {
+    //if(k_eatom.extent(0)<maxeatom) { // won't work without adding zero functor
+      memoryKK->destroy_kokkos(k_eatom,eatom);
+      memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"dihedral:eatom");
+      d_eatom = k_eatom.template view<KKDeviceType>();
+      k_eatom_pair = Kokkos::DualView<E_FLOAT*,Kokkos::LayoutRight,KKDeviceType>("dihedral:eatom_pair",maxeatom);
+      d_eatom_pair = k_eatom_pair.template view<KKDeviceType>();
+    //}
+  }
+  if (vflag_atom) {
+    //if(k_vatom.extent(0)<maxvatom) { // won't work without adding zero functor
+      memoryKK->destroy_kokkos(k_vatom,vatom);
+      memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"dihedral:vatom");
+      d_vatom = k_vatom.template view<KKDeviceType>();
+      k_vatom_pair = Kokkos::DualView<F_FLOAT*[6],Kokkos::LayoutRight,KKDeviceType>("dihedral:vatom_pair",maxvatom);
+      d_vatom_pair = k_vatom_pair.template view<KKDeviceType>();
+    //}
+  }
+
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  q = atomKK->k_q.view<DeviceType>();
+  atomtype = atomKK->k_type.view<DeviceType>();
+  neighborKK->k_dihedrallist.template sync<DeviceType>();
+  dihedrallist = neighborKK->k_dihedrallist.view<DeviceType>();
+  int ndihedrallist = neighborKK->ndihedrallist;
+  nlocal = atom->nlocal;
+  newton_bond = force->newton_bond;
+  qqrd2e = force->qqrd2e;
+
+  h_warning_flag() = 0;
+  k_warning_flag.template modify<LMPHostType>();
+  k_warning_flag.template sync<DeviceType>();
+
+  copymode = 1;
+
+  // loop over the dihedral list; reduce into evm only when energy/virial is being tallied
+
+  EVM_FLOAT evm;
+
+  if (evflag) {
+    if (newton_bond) {
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagDihedralCharmmfswCompute<1,1> >(0,ndihedrallist),*this,evm);
+    } else {
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagDihedralCharmmfswCompute<0,1> >(0,ndihedrallist),*this,evm);
+    }
+  } else {
+    if (newton_bond) {
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagDihedralCharmmfswCompute<1,0> >(0,ndihedrallist),*this);
+    } else {
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagDihedralCharmmfswCompute<0,0> >(0,ndihedrallist),*this);
+    }
+  }
+
+  // error check: the kernel sets the flag when its cosine term leaves [-1,1] by more than TOLERANCE
+
+  k_warning_flag.template modify<DeviceType>();
+  k_warning_flag.template sync<LMPHostType>();
+  if (h_warning_flag())
+    error->warning(FLERR,"Dihedral problem");
+
+  if (eflag_global) {
+    energy += evm.emol;
+    force->pair->eng_vdwl += evm.evdwl;
+    force->pair->eng_coul += evm.ecoul;
+  }
+  if (vflag_global) {
+    virial[0] += evm.v[0];
+    virial[1] += evm.v[1];
+    virial[2] += evm.v[2];
+    virial[3] += evm.v[3];
+    virial[4] += evm.v[4];
+    virial[5] += evm.v[5];
+
+    force->pair->virial[0] += evm.vp[0];
+    force->pair->virial[1] += evm.vp[1];
+    force->pair->virial[2] += evm.vp[2];
+    force->pair->virial[3] += evm.vp[3];
+    force->pair->virial[4] += evm.vp[4];
+    force->pair->virial[5] += evm.vp[5];
+  }
+
+  // don't yet have dualviews for eatom and vatom in pair_kokkos,
+  //  so need to manually copy these to pair style
+
+  int n = nlocal;
+  if (newton_bond) n += atom->nghost;
+
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+
+    k_eatom_pair.template modify<DeviceType>();
+    k_eatom_pair.template sync<LMPHostType>();
+    for (int i = 0; i < n; i++)
+      force->pair->eatom[i] += k_eatom_pair.h_view(i);
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+
+    k_vatom_pair.template modify<DeviceType>();
+    k_vatom_pair.template sync<LMPHostType>();
+    for (int i = 0; i < n; i++) {
+      force->pair->vatom[i][0] += k_vatom_pair.h_view(i,0);
+      force->pair->vatom[i][1] += k_vatom_pair.h_view(i,1);
+      force->pair->vatom[i][2] += k_vatom_pair.h_view(i,2);
+      force->pair->vatom[i][3] += k_vatom_pair.h_view(i,3);
+      force->pair->vatom[i][4] += k_vatom_pair.h_view(i,4);
+      force->pair->vatom[i][5] += k_vatom_pair.h_view(i,5);
+    }
+  }
+
+  copymode = 0;
+}
+
+template<class DeviceType>
+template<int NEWTON_BOND, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void DihedralCharmmfswKokkos<DeviceType>::operator()(TagDihedralCharmmfswCompute<NEWTON_BOND,EVFLAG>, const int &n, EVM_FLOAT& evm) const {
+  // kernel for entry n of dihedrallist: torsion forces plus weighted 1-4 pair terms; evm collects the tallies
+  // atomic, unmanaged alias of f: the same atom can receive contributions from several dihedrals
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,typename KKDevice<DeviceType>::value,Kokkos::MemoryTraits<Kokkos::Atomic|Kokkos::Unmanaged> > a_f = f;
+
+  const int i1 = dihedrallist(n,0);
+  const int i2 = dihedrallist(n,1);
+  const int i3 = dihedrallist(n,2);
+  const int i4 = dihedrallist(n,3);
+  const int type = dihedrallist(n,4);
+
+  // 1st bond
+
+  const F_FLOAT vb1x = x(i1,0) - x(i2,0);
+  const F_FLOAT vb1y = x(i1,1) - x(i2,1);
+  const F_FLOAT vb1z = x(i1,2) - x(i2,2);
+
+  // 2nd bond
+
+  const F_FLOAT vb2x = x(i3,0) - x(i2,0);
+  const F_FLOAT vb2y = x(i3,1) - x(i2,1);
+  const F_FLOAT vb2z = x(i3,2) - x(i2,2);
+
+  const F_FLOAT vb2xm = -vb2x;
+  const F_FLOAT vb2ym = -vb2y;
+  const F_FLOAT vb2zm = -vb2z;
+
+  // 3rd bond
+
+  const F_FLOAT vb3x = x(i4,0) - x(i3,0);
+  const F_FLOAT vb3y = x(i4,1) - x(i3,1);
+  const F_FLOAT vb3z = x(i4,2) - x(i3,2);
+
+  const F_FLOAT ax = vb1y*vb2zm - vb1z*vb2ym;
+  const F_FLOAT ay = vb1z*vb2xm - vb1x*vb2zm;
+  const F_FLOAT az = vb1x*vb2ym - vb1y*vb2xm;
+  const F_FLOAT bx = vb3y*vb2zm - vb3z*vb2ym;
+  const F_FLOAT by = vb3z*vb2xm - vb3x*vb2zm;
+  const F_FLOAT bz = vb3x*vb2ym - vb3y*vb2xm;
+
+  const F_FLOAT rasq = ax*ax + ay*ay + az*az;
+  const F_FLOAT rbsq = bx*bx + by*by + bz*bz;
+  const F_FLOAT rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
+  const F_FLOAT rg = sqrt(rgsq);
+
+  F_FLOAT rginv,ra2inv,rb2inv;
+  rginv = ra2inv = rb2inv = 0.0;
+  if (rg > 0) rginv = 1.0/rg;
+  if (rasq > 0) ra2inv = 1.0/rasq;
+  if (rbsq > 0) rb2inv = 1.0/rbsq;
+  const F_FLOAT rabinv = sqrt(ra2inv*rb2inv);
+
+  F_FLOAT c = (ax*bx + ay*by + az*bz)*rabinv;
+  F_FLOAT s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
+
+  // error check: raise the device warning flag once if c is outside [-1,1] by more than TOLERANCE
+
+  if ((c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) && !d_warning_flag())
+    d_warning_flag() = 1;
+
+  if (c > 1.0) c = 1.0;
+  if (c < -1.0) c = -1.0;
+
+  const int m = d_multiplicity[type];
+  F_FLOAT p = 1.0;
+  F_FLOAT ddf1,df1;
+  ddf1 = df1 = 0.0;
+
+  for (int i = 0; i < m; i++) {
+    ddf1 = p*c - df1*s;
+    df1 = p*s + df1*c;
+    p = ddf1;
+  }
+
+  p = p*d_cos_shift[type] + df1*d_sin_shift[type];
+  df1 = df1*d_cos_shift[type] - ddf1*d_sin_shift[type];
+  df1 *= -m;
+  p += 1.0;
+
+  if (m == 0) {
+    p = 1.0 + d_cos_shift[type];
+    df1 = 0.0;
+  }
+
+  E_FLOAT edihedral = 0.0;
+  if (eflag) edihedral = d_k[type] * p;
+
+  const F_FLOAT fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
+  const F_FLOAT hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
+  const F_FLOAT fga = fg*ra2inv*rginv;
+  const F_FLOAT hgb = hg*rb2inv*rginv;
+  const F_FLOAT gaa = -ra2inv*rg;
+  const F_FLOAT gbb = rb2inv*rg;
+
+  const F_FLOAT dtfx = gaa*ax;
+  const F_FLOAT dtfy = gaa*ay;
+  const F_FLOAT dtfz = gaa*az;
+  const F_FLOAT dtgx = fga*ax - hgb*bx;
+  const F_FLOAT dtgy = fga*ay - hgb*by;
+  const F_FLOAT dtgz = fga*az - hgb*bz;
+  const F_FLOAT dthx = gbb*bx;
+  const F_FLOAT dthy = gbb*by;
+  const F_FLOAT dthz = gbb*bz;
+
+  const F_FLOAT df = -d_k[type] * df1;
+
+  const F_FLOAT sx2 = df*dtgx;
+  const F_FLOAT sy2 = df*dtgy;
+  const F_FLOAT sz2 = df*dtgz;
+
+  F_FLOAT f1[3],f2[3],f3[3],f4[3];
+  f1[0] = df*dtfx;
+  f1[1] = df*dtfy;
+  f1[2] = df*dtfz;
+
+  f2[0] = sx2 - f1[0];
+  f2[1] = sy2 - f1[1];
+  f2[2] = sz2 - f1[2];
+
+  f4[0] = df*dthx;
+  f4[1] = df*dthy;
+  f4[2] = df*dthz;
+
+  f3[0] = -sx2 - f4[0];
+  f3[1] = -sy2 - f4[1];
+  f3[2] = -sz2 - f4[2];
+
+  // apply force to each of 4 atoms
+
+  if (NEWTON_BOND || i1 < nlocal) {
+    a_f(i1,0) += f1[0];
+    a_f(i1,1) += f1[1];
+    a_f(i1,2) += f1[2];
+  }
+
+  if (NEWTON_BOND || i2 < nlocal) {
+    a_f(i2,0) += f2[0];
+    a_f(i2,1) += f2[1];
+    a_f(i2,2) += f2[2];
+  }
+
+  if (NEWTON_BOND || i3 < nlocal) {
+    a_f(i3,0) += f3[0];
+    a_f(i3,1) += f3[1];
+    a_f(i3,2) += f3[2];
+  }
+
+  if (NEWTON_BOND || i4 < nlocal) {
+    a_f(i4,0) += f4[0];
+    a_f(i4,1) += f4[1];
+    a_f(i4,2) += f4[2];
+  }
+
+  if (EVFLAG)
+    ev_tally(evm,i1,i2,i3,i4,edihedral,f1,f3,f4,
+             vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z);
+
+  // 1-4 LJ and Coulomb interactions
+  // tally energy/virial in pair, using newton_bond as newton flag
+
+  if (d_weight[type] > 0.0) {
+    const int itype = atomtype[i1];
+    const int jtype = atomtype[i4];
+
+    const F_FLOAT delx = x(i1,0) - x(i4,0);
+    const F_FLOAT dely = x(i1,1) - x(i4,1);
+    const F_FLOAT delz = x(i1,2) - x(i4,2);
+    const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+    const F_FLOAT r2inv = 1.0/rsq;
+    const F_FLOAT r6inv = r2inv*r2inv*r2inv;
+
+    const F_FLOAT r = sqrt(rsq);
+    F_FLOAT forcecoul;
+    if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv;
+    else if (dihedflag) forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv);
+    // without dihedflag the energy below is force-shifted, so the force must be its derivative
+    else forcecoul = qqrd2e * q[i1]*q[i4]*(sqrt(r2inv) - r*cut_coulinv14*cut_coulinv14);
+    const F_FLOAT forcelj = r6inv * (d_lj14_1(itype,jtype)*r6inv - d_lj14_2(itype,jtype));
+    const F_FLOAT fpair = d_weight[type] * (forcelj+forcecoul)*r2inv;
+    F_FLOAT ecoul = 0.0, evdwl = 0.0;
+    F_FLOAT evdwl14_12, evdwl14_6;
+    if (eflag) {
+      if (dihedflag)
+        ecoul = d_weight[type] * forcecoul;
+      else
+        ecoul = d_weight[type] * qqrd2e * q[i1] * q[i4] *
+          (sqrt(r2inv) + r * cut_coulinv14 * cut_coulinv14 - 2.0 * cut_coulinv14);
+      evdwl14_12 = r6inv * d_lj14_3(itype,jtype) * r6inv -
+        d_lj14_3(itype,jtype) * cut_lj_inner6inv * cut_lj6inv;
+      evdwl14_6 =
+        -d_lj14_4(itype,jtype) * r6inv + d_lj14_4(itype,jtype) * cut_lj_inner3inv * cut_lj3inv;
+      evdwl = evdwl14_12 + evdwl14_6;
+      evdwl *= d_weight[type];
+    }
+
+    if (newton_bond || i1 < nlocal) {
+      a_f(i1,0) += delx*fpair;
+      a_f(i1,1) += dely*fpair;
+      a_f(i1,2) += delz*fpair;
+    }
+    if (newton_bond || i4 < nlocal) {
+      a_f(i4,0) -= delx*fpair;
+      a_f(i4,1) -= dely*fpair;
+      a_f(i4,2) -= delz*fpair;
+    }
+
+    if (EVFLAG) ev_tally(evm,i1,i4,evdwl,ecoul,fpair,delx,dely,delz);
+  }
+}
+
+// Accumulator-free overload: runs the tallying kernel with a local EVM_FLOAT that is then dropped.
+template<class DeviceType> template<int NEWTON_BOND, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void DihedralCharmmfswKokkos<DeviceType>::operator()(TagDihedralCharmmfswCompute<NEWTON_BOND,EVFLAG>, const int &n) const {
+  EVM_FLOAT discarded;
+  this->template operator()<NEWTON_BOND,EVFLAG>(TagDihedralCharmmfswCompute<NEWTON_BOND,EVFLAG>(), n, discarded);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Allocation is forwarded unchanged to the DihedralCharmmfsw base class.
+template<class DeviceType>
+void DihedralCharmmfswKokkos<DeviceType>::allocate() {
+  this->DihedralCharmmfsw::allocate();
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more types
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void DihedralCharmmfswKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+  // base class handles the arguments first; its per-type arrays (k, multiplicity, ...) are mirrored below
+  DihedralCharmmfsw::coeff(narg, arg);
+  const int ntypes = atom->ndihedraltypes;
+
+  typename AT::tdual_ffloat_1d dual_k("DihedralCharmmfsw::k",ntypes+1);
+  typename AT::tdual_ffloat_1d dual_mult("DihedralCharmmfsw::multiplicity",ntypes+1);
+  typename AT::tdual_ffloat_1d dual_shift("DihedralCharmmfsw::shift",ntypes+1);
+  typename AT::tdual_ffloat_1d dual_cos("DihedralCharmmfsw::cos_shift",ntypes+1);
+  typename AT::tdual_ffloat_1d dual_sin("DihedralCharmmfsw::sin_shift",ntypes+1);
+  typename AT::tdual_ffloat_1d dual_weight("DihedralCharmmfsw::weight",ntypes+1);
+  // dihedral types are 1-based, hence the ntypes+1 extents; slot 0 is never written here
+  for (int itype = 1; itype <= ntypes; itype++) {
+    dual_k.h_view(itype) = k[itype];
+    dual_mult.h_view(itype) = multiplicity[itype];
+    dual_shift.h_view(itype) = shift[itype];
+    dual_cos.h_view(itype) = cos_shift[itype];
+    dual_sin.h_view(itype) = sin_shift[itype];
+    dual_weight.h_view(itype) = weight[itype];
+  }
+
+  // mark each host copy as modified, then bring the DeviceType side up to date
+  dual_k.template modify<LMPHostType>();
+  dual_k.template sync<DeviceType>();
+  dual_mult.template modify<LMPHostType>();
+  dual_mult.template sync<DeviceType>();
+  dual_shift.template modify<LMPHostType>();
+  dual_shift.template sync<DeviceType>();
+  dual_cos.template modify<LMPHostType>();
+  dual_cos.template sync<DeviceType>();
+  dual_sin.template modify<LMPHostType>();
+  dual_sin.template sync<DeviceType>();
+  dual_weight.template modify<LMPHostType>();
+  dual_weight.template sync<DeviceType>();
+  // keep the DeviceType views as members; the compute kernel indexes them by dihedral type
+  d_k = dual_k.template view<DeviceType>();
+  d_multiplicity = dual_mult.template view<DeviceType>();
+  d_shift = dual_shift.template view<DeviceType>();
+  d_cos_shift = dual_cos.template view<DeviceType>();
+  d_sin_shift = dual_sin.template view<DeviceType>();
+  d_weight = dual_weight.template view<DeviceType>();
+}
+
+/* ----------------------------------------------------------------------
+   error check and initialize all values needed for force computation
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void DihedralCharmmfswKokkos<DeviceType>::init_style()
+{
+  // base-class init_style() runs first; weightflag and lj14_1..lj14_4 are read only afterwards
+  DihedralCharmmfsw::init_style();
+
+  const int ntypes = atom->ntypes;
+  DAT::tdual_ffloat_2d k_lj14_1("DihedralCharmmfsw:lj14_1",ntypes+1,ntypes+1);
+  DAT::tdual_ffloat_2d k_lj14_2("DihedralCharmmfsw:lj14_2",ntypes+1,ntypes+1);
+  DAT::tdual_ffloat_2d k_lj14_3("DihedralCharmmfsw:lj14_3",ntypes+1,ntypes+1);
+  DAT::tdual_ffloat_2d k_lj14_4("DihedralCharmmfsw:lj14_4",ntypes+1,ntypes+1);
+
+  d_lj14_1 = k_lj14_1.template view<DeviceType>();
+  d_lj14_2 = k_lj14_2.template view<DeviceType>();
+  d_lj14_3 = k_lj14_3.template view<DeviceType>();
+  d_lj14_4 = k_lj14_4.template view<DeviceType>();
+
+  // the host tables are filled only when weightflag is set; both indices run from 1 to ntypes
+  if (weightflag) {
+    for (int itype = 1; itype <= ntypes; ++itype) {
+      for (int jtype = 1; jtype <= ntypes; ++jtype) {
+        k_lj14_1.h_view(itype,jtype) = lj14_1[itype][jtype];
+        k_lj14_2.h_view(itype,jtype) = lj14_2[itype][jtype];
+        k_lj14_3.h_view(itype,jtype) = lj14_3[itype][jtype];
+        k_lj14_4.h_view(itype,jtype) = lj14_4[itype][jtype];
+      }
+    }
+  }
+
+  // mark each host table as modified, then sync it to DeviceType (done whether or not it was filled)
+  k_lj14_1.template modify<LMPHostType>();
+  k_lj14_1.template sync<DeviceType>();
+  k_lj14_2.template modify<LMPHostType>();
+  k_lj14_2.template sync<DeviceType>();
+  k_lj14_3.template modify<LMPHostType>();
+  k_lj14_3.template sync<DeviceType>();
+  k_lj14_4.template modify<LMPHostType>();
+  k_lj14_4.template sync<DeviceType>();
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads coeffs from restart file, bcasts them
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void DihedralCharmmfswKokkos<DeviceType>::read_restart(FILE *fp)
+{
+  // the base-class read_restart() is called first; the per-type arrays are mirrored only after it returns
+  DihedralCharmmfsw::read_restart(fp);
+
+  using dual_1d = typename AT::tdual_ffloat_1d;
+  const int ntypes = atom->ndihedraltypes;  // the copy loop covers 1..ntypes, hence ntypes+1 slots per view
+  dual_1d k_k("DihedralCharmmfsw::k",ntypes+1);
+  dual_1d k_multiplicity("DihedralCharmmfsw::multiplicity",ntypes+1);
+  dual_1d k_shift("DihedralCharmmfsw::shift",ntypes+1);
+  dual_1d k_cos_shift("DihedralCharmmfsw::cos_shift",ntypes+1);
+  dual_1d k_sin_shift("DihedralCharmmfsw::sin_shift",ntypes+1);
+  dual_1d k_weight("DihedralCharmmfsw::weight",ntypes+1);
+
+  d_k = k_k.template view<DeviceType>();
+  d_multiplicity = k_multiplicity.template view<DeviceType>();
+  d_shift = k_shift.template view<DeviceType>();
+  d_cos_shift = k_cos_shift.template view<DeviceType>();
+  d_sin_shift = k_sin_shift.template view<DeviceType>();
+  d_weight = k_weight.template view<DeviceType>();
+
+  for (int itype = 1; itype <= ntypes; ++itype) {
+    k_k.h_view(itype) = k[itype];
+    k_multiplicity.h_view(itype) = multiplicity[itype];
+    k_shift.h_view(itype) = shift[itype];
+    k_cos_shift.h_view(itype) = cos_shift[itype];
+    k_sin_shift.h_view(itype) = sin_shift[itype];
+    k_weight.h_view(itype) = weight[itype];
+  }
+
+  k_k.template modify<LMPHostType>();
+  k_k.template sync<DeviceType>();
+  k_multiplicity.template modify<LMPHostType>();
+  k_multiplicity.template sync<DeviceType>();
+  k_shift.template modify<LMPHostType>();
+  k_shift.template sync<DeviceType>();
+  k_cos_shift.template modify<LMPHostType>();
+  k_cos_shift.template sync<DeviceType>();
+  k_sin_shift.template modify<LMPHostType>();
+  k_sin_shift.template sync<DeviceType>();
+  k_weight.template modify<LMPHostType>();
+  k_weight.template sync<DeviceType>();
+}
+
+/* ----------------------------------------------------------------------
+   tally energy and virial into global and per-atom accumulators
+   virial = r1F1 + r2F2 + r3F3 + r4F4 = (r1-r2) F1 + (r3-r2) F3 + (r4-r2) F4
+          = (r1-r2) F1 + (r3-r2) F3 + (r4-r3 + r3-r2) F4
+          = vb1*f1 + vb2*f3 + (vb3+vb2)*f4
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+// (disabled: template<int NEWTON_BOND>) newton_bond is tested as a run-time member in the body below
+KOKKOS_INLINE_FUNCTION
+void DihedralCharmmfswKokkos<DeviceType>::ev_tally(EVM_FLOAT &evm, const int i1, const int i2, const int i3, const int i4, // evm: accumulator for the global tallies; i1..i4: indices of the four atoms
+                        F_FLOAT &edihedral, F_FLOAT *f1, F_FLOAT *f3, F_FLOAT *f4, // edihedral: energy to tally; f1,f3,f4: 3-component force arrays (F1,F3,F4 in the formula above)
+                        const F_FLOAT &vb1x, const F_FLOAT &vb1y, const F_FLOAT &vb1z, // vb1 = r1-r2
+                        const F_FLOAT &vb2x, const F_FLOAT &vb2y, const F_FLOAT &vb2z, // vb2 = r3-r2
+                        const F_FLOAT &vb3x, const F_FLOAT &vb3y, const F_FLOAT &vb3z) const // vb3 = r4-r3
+{
+  E_FLOAT edihedralquarter;
+  F_FLOAT v[6];
+
+  if (eflag_either) {
+    if (eflag_global) {
+      if (newton_bond) evm.emol += edihedral; // with newton_bond set the whole energy is tallied at once
+      else {
+        edihedralquarter = 0.25*edihedral; // otherwise one quarter is added per atom whose index is below nlocal
+        if (i1 < nlocal) evm.emol += edihedralquarter;
+        if (i2 < nlocal) evm.emol += edihedralquarter;
+        if (i3 < nlocal) evm.emol += edihedralquarter;
+        if (i4 < nlocal) evm.emol += edihedralquarter;
+      }
+    }
+    if (eflag_atom) {
+      edihedralquarter = 0.25*edihedral; // per-atom energy: a quarter goes to each of the four atoms that passes the test
+      if (newton_bond || i1 < nlocal) d_eatom[i1] += edihedralquarter;
+      if (newton_bond || i2 < nlocal) d_eatom[i2] += edihedralquarter;
+      if (newton_bond || i3 < nlocal) d_eatom[i3] += edihedralquarter;
+      if (newton_bond || i4 < nlocal) d_eatom[i4] += edihedralquarter;
+    }
+  }
+
+  if (vflag_either) {
+    v[0] = vb1x*f1[0] + vb2x*f3[0] + (vb3x+vb2x)*f4[0]; // component order: xx, yy, zz, xy, xz, yz
+    v[1] = vb1y*f1[1] + vb2y*f3[1] + (vb3y+vb2y)*f4[1];
+    v[2] = vb1z*f1[2] + vb2z*f3[2] + (vb3z+vb2z)*f4[2];
+    v[3] = vb1x*f1[1] + vb2x*f3[1] + (vb3x+vb2x)*f4[1];
+    v[4] = vb1x*f1[2] + vb2x*f3[2] + (vb3x+vb2x)*f4[2];
+    v[5] = vb1y*f1[2] + vb2y*f3[2] + (vb3y+vb2y)*f4[2];
+
+    if (vflag_global) {
+      if (newton_bond) { // full virial in one step
+        evm.v[0] += v[0];
+        evm.v[1] += v[1];
+        evm.v[2] += v[2];
+        evm.v[3] += v[3];
+        evm.v[4] += v[4];
+        evm.v[5] += v[5];
+      } else { // a quarter of the virial per atom whose index is below nlocal
+        if (i1 < nlocal) {
+          evm.v[0] += 0.25*v[0];
+          evm.v[1] += 0.25*v[1];
+          evm.v[2] += 0.25*v[2];
+          evm.v[3] += 0.25*v[3];
+          evm.v[4] += 0.25*v[4];
+          evm.v[5] += 0.25*v[5];
+        }
+        if (i2 < nlocal) {
+          evm.v[0] += 0.25*v[0];
+          evm.v[1] += 0.25*v[1];
+          evm.v[2] += 0.25*v[2];
+          evm.v[3] += 0.25*v[3];
+          evm.v[4] += 0.25*v[4];
+          evm.v[5] += 0.25*v[5];
+        }
+        if (i3 < nlocal) {
+          evm.v[0] += 0.25*v[0];
+          evm.v[1] += 0.25*v[1];
+          evm.v[2] += 0.25*v[2];
+          evm.v[3] += 0.25*v[3];
+          evm.v[4] += 0.25*v[4];
+          evm.v[5] += 0.25*v[5];
+        }
+        if (i4 < nlocal) {
+          evm.v[0] += 0.25*v[0];
+          evm.v[1] += 0.25*v[1];
+          evm.v[2] += 0.25*v[2];
+          evm.v[3] += 0.25*v[3];
+          evm.v[4] += 0.25*v[4];
+          evm.v[5] += 0.25*v[5];
+        }
+      }
+    }
+
+    if (vflag_atom) { // per-atom virial goes into d_vatom (declared with the Kokkos::Atomic memory trait in the header)
+      if (newton_bond || i1 < nlocal) {
+        d_vatom(i1,0) += 0.25*v[0];
+        d_vatom(i1,1) += 0.25*v[1];
+        d_vatom(i1,2) += 0.25*v[2];
+        d_vatom(i1,3) += 0.25*v[3];
+        d_vatom(i1,4) += 0.25*v[4];
+        d_vatom(i1,5) += 0.25*v[5];
+      }
+      if (newton_bond || i2 < nlocal) {
+        d_vatom(i2,0) += 0.25*v[0];
+        d_vatom(i2,1) += 0.25*v[1];
+        d_vatom(i2,2) += 0.25*v[2];
+        d_vatom(i2,3) += 0.25*v[3];
+        d_vatom(i2,4) += 0.25*v[4];
+        d_vatom(i2,5) += 0.25*v[5];
+      }
+      if (newton_bond || i3 < nlocal) {
+        d_vatom(i3,0) += 0.25*v[0];
+        d_vatom(i3,1) += 0.25*v[1];
+        d_vatom(i3,2) += 0.25*v[2];
+        d_vatom(i3,3) += 0.25*v[3];
+        d_vatom(i3,4) += 0.25*v[4];
+        d_vatom(i3,5) += 0.25*v[5];
+      }
+      if (newton_bond || i4 < nlocal) {
+        d_vatom(i4,0) += 0.25*v[0];
+        d_vatom(i4,1) += 0.25*v[1];
+        d_vatom(i4,2) += 0.25*v[2];
+        d_vatom(i4,3) += 0.25*v[3];
+        d_vatom(i4,4) += 0.25*v[4];
+        d_vatom(i4,5) += 0.25*v[5];
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   tally eng_vdwl and virial into global and per-atom accumulators
+   need i < nlocal test since called by bond_quartic and dihedral_charmm
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void DihedralCharmmfswKokkos<DeviceType>::ev_tally(EVM_FLOAT &evm, const int i, const int j, // evm: accumulator for the global tallies; i,j: indices of the two atoms
+      const F_FLOAT &evdwl, const F_FLOAT &ecoul, const F_FLOAT &fpair, const F_FLOAT &delx, // evdwl/ecoul: energies to tally; fpair: scalar multiplying del*del in the virial
+                const F_FLOAT &dely, const F_FLOAT &delz) const // delx,dely,delz: components of the i-j separation used for the virial
+{
+  E_FLOAT evdwlhalf,ecoulhalf,epairhalf;
+  F_FLOAT v[6];
+
+
+  if (eflag_either) {
+    if (eflag_global) {
+      if (newton_bond) { // with newton_bond set both energies are tallied in full
+        evm.evdwl += evdwl;
+        evm.ecoul += ecoul;
+      } else { // otherwise half per atom whose index is below nlocal
+        evdwlhalf = 0.5*evdwl;
+        ecoulhalf = 0.5*ecoul;
+        if (i < nlocal) {
+          evm.evdwl += evdwlhalf;
+          evm.ecoul += ecoulhalf;
+        }
+        if (j < nlocal) {
+          evm.evdwl += evdwlhalf;
+          evm.ecoul += ecoulhalf;
+        }
+      }
+    }
+    if (eflag_atom) {
+      epairhalf = 0.5 * (evdwl + ecoul); // per-atom energy combines both terms and goes to d_eatom_pair, not d_eatom
+      if (newton_bond || i < nlocal) d_eatom_pair[i] += epairhalf;
+      if (newton_bond || j < nlocal) d_eatom_pair[j] += epairhalf;
+    }
+  }
+
+  if (vflag_either) {
+    v[0] = delx*delx*fpair; // component order: xx, yy, zz, xy, xz, yz
+    v[1] = dely*dely*fpair;
+    v[2] = delz*delz*fpair;
+    v[3] = delx*dely*fpair;
+    v[4] = delx*delz*fpair;
+    v[5] = dely*delz*fpair;
+
+    if (vflag_global) { // global virial of this overload accumulates in evm.vp (the four-atom overload uses evm.v)
+      if (newton_bond) {
+        evm.vp[0] += v[0];
+        evm.vp[1] += v[1];
+        evm.vp[2] += v[2];
+        evm.vp[3] += v[3];
+        evm.vp[4] += v[4];
+        evm.vp[5] += v[5];
+      } else {
+        if (i < nlocal) {
+          evm.vp[0] += 0.5*v[0];
+          evm.vp[1] += 0.5*v[1];
+          evm.vp[2] += 0.5*v[2];
+          evm.vp[3] += 0.5*v[3];
+          evm.vp[4] += 0.5*v[4];
+          evm.vp[5] += 0.5*v[5];
+        }
+        if (j < nlocal) {
+          evm.vp[0] += 0.5*v[0];
+          evm.vp[1] += 0.5*v[1];
+          evm.vp[2] += 0.5*v[2];
+          evm.vp[3] += 0.5*v[3];
+          evm.vp[4] += 0.5*v[4];
+          evm.vp[5] += 0.5*v[5];
+        }
+      }
+    }
+
+    if (vflag_atom) { // per-atom virial: half to each atom, written to d_vatom_pair
+      if (newton_bond || i < nlocal) {
+        d_vatom_pair(i,0) += 0.5*v[0];
+        d_vatom_pair(i,1) += 0.5*v[1];
+        d_vatom_pair(i,2) += 0.5*v[2];
+        d_vatom_pair(i,3) += 0.5*v[3];
+        d_vatom_pair(i,4) += 0.5*v[4];
+        d_vatom_pair(i,5) += 0.5*v[5];
+      }
+      if (newton_bond || j < nlocal) {
+        d_vatom_pair(j,0) += 0.5*v[0];
+        d_vatom_pair(j,1) += 0.5*v[1];
+        d_vatom_pair(j,2) += 0.5*v[2];
+        d_vatom_pair(j,3) += 0.5*v[3];
+        d_vatom_pair(j,4) += 0.5*v[4];
+        d_vatom_pair(j,5) += 0.5*v[5];
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+namespace LAMMPS_NS {
+template class DihedralCharmmfswKokkos<LMPDeviceType>;  // explicit instantiation for the device type
+#ifdef LMP_KOKKOS_GPU
+template class DihedralCharmmfswKokkos<LMPHostType>;  // host-type instantiation, compiled only when LMP_KOKKOS_GPU is defined
+#endif
+}
+
diff --git a/src/KOKKOS/dihedral_charmmfsw_kokkos.h b/src/KOKKOS/dihedral_charmmfsw_kokkos.h
new file mode 100644
index 0000000000..b1c65ae477
--- /dev/null
+++ b/src/KOKKOS/dihedral_charmmfsw_kokkos.h
@@ -0,0 +1,118 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+// clang-format off
+DihedralStyle(charmmfsw/kk,DihedralCharmmfswKokkos<LMPDeviceType>); // plain /kk suffix maps to the device instantiation
+DihedralStyle(charmmfsw/kk/device,DihedralCharmmfswKokkos<LMPDeviceType>);
+DihedralStyle(charmmfsw/kk/host,DihedralCharmmfswKokkos<LMPHostType>);
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_DIHEDRAL_CHARMMFSW_KOKKOS_H
+#define LMP_DIHEDRAL_CHARMMFSW_KOKKOS_H
+
+#include "dihedral_charmmfsw.h"
+#include "kokkos_type.h"
+#include "dihedral_charmm_kokkos.h" // needed for s_EVM_FLOAT
+
+namespace LAMMPS_NS {
+
+template<int NEWTON_BOND, int EVFLAG>
+struct TagDihedralCharmmfswCompute{}; // empty tag type; selects the operator() overloads declared in the class below
+
+template<class DeviceType>
+class DihedralCharmmfswKokkos : public DihedralCharmmfsw { // Kokkos variant of DihedralCharmmfsw, parameterized on the execution device
+ public:
+  typedef DeviceType device_type;
+  typedef EVM_FLOAT value_type; // same type the accumulating operator() and ev_tally() take by reference
+  typedef ArrayTypes<DeviceType> AT;
+
+  DihedralCharmmfswKokkos(class LAMMPS *);
+  ~DihedralCharmmfswKokkos() override;
+  void compute(int, int) override;
+  void coeff(int, char **) override; // calls the base coeff(), then mirrors the per-type arrays into d_k ... d_weight
+  void init_style() override; // calls the base init_style(), then mirrors lj14_1..lj14_4 into d_lj14_*
+  void read_restart(FILE *) override; // calls the base read_restart(), then mirrors the per-type arrays like coeff()
+
+  template<int NEWTON_BOND, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagDihedralCharmmfswCompute<NEWTON_BOND,EVFLAG>, const int&, EVM_FLOAT&) const; // overload that accumulates into the supplied EVM_FLOAT
+
+  template<int NEWTON_BOND, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagDihedralCharmmfswCompute<NEWTON_BOND,EVFLAG>, const int&) const; // overload without an accumulator argument; the .cpp forwards it to the overload above
+
+  // (disabled: template<int NEWTON_BOND>) the definition tests the newton_bond member at run time instead
+  KOKKOS_INLINE_FUNCTION
+  void ev_tally(EVM_FLOAT &evm, const int i1, const int i2, const int i3, const int i4, // four-atom tally into evm.emol / evm.v and d_eatom / d_vatom
+                          F_FLOAT &edihedral, F_FLOAT *f1, F_FLOAT *f3, F_FLOAT *f4,
+                          const F_FLOAT &vb1x, const F_FLOAT &vb1y, const F_FLOAT &vb1z,
+                          const F_FLOAT &vb2x, const F_FLOAT &vb2y, const F_FLOAT &vb2z,
+                          const F_FLOAT &vb3x, const F_FLOAT &vb3y, const F_FLOAT &vb3z) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void ev_tally(EVM_FLOAT &evm, const int i, const int j, // two-atom tally into evm.evdwl / evm.ecoul / evm.vp and d_eatom_pair / d_vatom_pair
+        const F_FLOAT &evdwl, const F_FLOAT &ecoul, const F_FLOAT &fpair, const F_FLOAT &delx,
+                const F_FLOAT &dely, const F_FLOAT &delz) const;
+
+ protected:
+
+  class NeighborKokkos *neighborKK;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread atomtype;
+  typename AT::t_ffloat_1d_randomread q;
+  typename AT::t_f_array f;
+  typename AT::t_int_2d dihedrallist;
+
+  typedef typename KKDevice<DeviceType>::value KKDeviceType;
+  Kokkos::DualView<E_FLOAT*,Kokkos::LayoutRight,KKDeviceType> k_eatom; // per-atom energy / virial storage written by the four-atom ev_tally()
+  Kokkos::DualView<F_FLOAT*[6],Kokkos::LayoutRight,KKDeviceType> k_vatom;
+  Kokkos::View<E_FLOAT*,Kokkos::LayoutRight,KKDeviceType,Kokkos::MemoryTraits<Kokkos::Atomic> > d_eatom; // views carrying the Kokkos::Atomic memory trait
+  Kokkos::View<F_FLOAT*[6],Kokkos::LayoutRight,KKDeviceType,Kokkos::MemoryTraits<Kokkos::Atomic> > d_vatom;
+
+  Kokkos::DualView<E_FLOAT*,Kokkos::LayoutRight,KKDeviceType> k_eatom_pair; // separate per-atom storage written by the two-atom ev_tally()
+  Kokkos::DualView<F_FLOAT*[6],Kokkos::LayoutRight,KKDeviceType> k_vatom_pair;
+  Kokkos::View<E_FLOAT*,Kokkos::LayoutRight,KKDeviceType,Kokkos::MemoryTraits<Kokkos::Atomic> > d_eatom_pair;
+  Kokkos::View<F_FLOAT*[6],Kokkos::LayoutRight,KKDeviceType,Kokkos::MemoryTraits<Kokkos::Atomic> > d_vatom_pair;
+
+  int nlocal,newton_bond; // both are tested inside the ev_tally() overloads
+  int eflag,vflag;
+  double qqrd2e;
+
+  Kokkos::DualView<int,DeviceType> k_warning_flag;
+  typename Kokkos::DualView<int,DeviceType>::t_dev d_warning_flag;
+  typename Kokkos::DualView<int,DeviceType>::t_host h_warning_flag;
+
+  typename AT::t_ffloat_2d d_lj14_1; // (ntypes+1) x (ntypes+1) tables assigned in init_style()
+  typename AT::t_ffloat_2d d_lj14_2;
+  typename AT::t_ffloat_2d d_lj14_3;
+  typename AT::t_ffloat_2d d_lj14_4;
+
+  typename AT::t_ffloat_1d d_k; // per-dihedral-type coefficient views assigned in coeff() and read_restart()
+  typename AT::t_ffloat_1d d_multiplicity;
+  typename AT::t_ffloat_1d d_shift;
+  typename AT::t_ffloat_1d d_sin_shift;
+  typename AT::t_ffloat_1d d_cos_shift;
+  typename AT::t_ffloat_1d d_weight;
+
+  void allocate() override; // forwards to DihedralCharmmfsw::allocate()
+};
+
+}
+
+#endif
+#endif
+
diff --git a/src/KOKKOS/dynamical_matrix_kokkos.cpp b/src/KOKKOS/dynamical_matrix_kokkos.cpp
index 32986025e6..ec2cc17ef2 100644
--- a/src/KOKKOS/dynamical_matrix_kokkos.cpp
+++ b/src/KOKKOS/dynamical_matrix_kokkos.cpp
@@ -174,72 +174,45 @@ void DynamicalMatrixKokkos::update_force()
   }
 
   bool execute_on_host = false;
-  unsigned int datamask_read_device = 0;
-  unsigned int datamask_modify_device = 0;
   unsigned int datamask_read_host = 0;
 
   if (pair_compute_flag) {
     if (force->pair->execution_space==Host) {
       execute_on_host  = true;
       datamask_read_host   |= force->pair->datamask_read;
-      datamask_modify_device |= force->pair->datamask_modify;
-    } else {
-      datamask_read_device   |= force->pair->datamask_read;
-      datamask_modify_device |= force->pair->datamask_modify;
     }
   }
   if (atomKK->molecular && force->bond)  {
     if (force->bond->execution_space==Host) {
       execute_on_host  = true;
       datamask_read_host   |= force->bond->datamask_read;
-      datamask_modify_device |= force->bond->datamask_modify;
-    } else {
-      datamask_read_device   |= force->bond->datamask_read;
-      datamask_modify_device |= force->bond->datamask_modify;
     }
   }
   if (atomKK->molecular && force->angle) {
     if (force->angle->execution_space==Host) {
       execute_on_host  = true;
       datamask_read_host   |= force->angle->datamask_read;
-      datamask_modify_device |= force->angle->datamask_modify;
-    } else {
-      datamask_read_device   |= force->angle->datamask_read;
-      datamask_modify_device |= force->angle->datamask_modify;
     }
   }
   if (atomKK->molecular && force->dihedral) {
     if (force->dihedral->execution_space==Host) {
       execute_on_host  = true;
       datamask_read_host   |= force->dihedral->datamask_read;
-      datamask_modify_device |= force->dihedral->datamask_modify;
-    } else {
-      datamask_read_device   |= force->dihedral->datamask_read;
-      datamask_modify_device |= force->dihedral->datamask_modify;
     }
   }
   if (atomKK->molecular && force->improper) {
     if (force->improper->execution_space==Host) {
       execute_on_host  = true;
       datamask_read_host   |= force->improper->datamask_read;
-      datamask_modify_device |= force->improper->datamask_modify;
-    } else {
-      datamask_read_device   |= force->improper->datamask_read;
-      datamask_modify_device |= force->improper->datamask_modify;
     }
   }
   if (kspace_compute_flag) {
     if (force->kspace->execution_space==Host) {
       execute_on_host  = true;
       datamask_read_host   |= force->kspace->datamask_read;
-      datamask_modify_device |= force->kspace->datamask_modify;
-    } else {
-      datamask_read_device   |= force->kspace->datamask_read;
-      datamask_modify_device |= force->kspace->datamask_modify;
     }
   }
 
-
   if (pair_compute_flag) {
     atomKK->sync(force->pair->execution_space,force->pair->datamask_read);
     atomKK->sync(force->pair->execution_space,~(~force->pair->datamask_read|(F_MASK | ENERGY_MASK | VIRIAL_MASK)));
diff --git a/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp b/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp
index 59ed918729..9c34908d08 100644
--- a/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp
+++ b/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp
@@ -192,7 +192,7 @@ void FixACKS2ReaxFFKokkos<DeviceType>::setup_pre_force(int vflag)
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
-void FixACKS2ReaxFFKokkos<DeviceType>::pre_force(int vflag)
+void FixACKS2ReaxFFKokkos<DeviceType>::pre_force(int /*vflag*/)
 {
   if (update->ntimestep % nevery) return;
 
@@ -298,8 +298,8 @@ void FixACKS2ReaxFFKokkos<DeviceType>::pre_force(int vflag)
   } else { // GPU, use teams
     Kokkos::deep_copy(d_mfill_offset,0);
 
-    int vector_length = 32;
     int atoms_per_team = 4;
+    int vector_length = 32;
     int num_teams = nn / atoms_per_team + (nn % atoms_per_team ? 1 : 0);
 
     Kokkos::TeamPolicy<DeviceType> policy(num_teams, atoms_per_team,
diff --git a/src/KOKKOS/fix_acks2_reaxff_kokkos.h b/src/KOKKOS/fix_acks2_reaxff_kokkos.h
index 127c8d0402..c27719c364 100644
--- a/src/KOKKOS/fix_acks2_reaxff_kokkos.h
+++ b/src/KOKKOS/fix_acks2_reaxff_kokkos.h
@@ -289,8 +289,7 @@ struct FixACKS2ReaxFFKokkosComputeHFunctor {
 
   FixACKS2ReaxFFKokkosComputeHFunctor(FixACKS2ReaxFFKokkos<DeviceType> *c_ptr,
                                   int _atoms_per_team, int _vector_length)
-      : c(*c_ptr), atoms_per_team(_atoms_per_team),
-        vector_length(_vector_length) {
+      : atoms_per_team(_atoms_per_team), vector_length(_vector_length), c(*c_ptr) {
     c.cleanup_copy();
   };
 
@@ -337,8 +336,7 @@ struct FixACKS2ReaxFFKokkosComputeXFunctor {
 
   FixACKS2ReaxFFKokkosComputeXFunctor(FixACKS2ReaxFFKokkos<DeviceType> *c_ptr,
                                   int _atoms_per_team, int _vector_length)
-      : c(*c_ptr), atoms_per_team(_atoms_per_team),
-        vector_length(_vector_length) {
+    : atoms_per_team(_atoms_per_team), vector_length(_vector_length), c(*c_ptr) {
     c.cleanup_copy();
   };
 
diff --git a/src/KOKKOS/fix_neigh_history_kokkos.cpp b/src/KOKKOS/fix_neigh_history_kokkos.cpp
index b4a852ba70..49fe3f1177 100644
--- a/src/KOKKOS/fix_neigh_history_kokkos.cpp
+++ b/src/KOKKOS/fix_neigh_history_kokkos.cpp
@@ -453,8 +453,12 @@ KOKKOS_INLINE_FUNCTION
 void FixNeighHistoryKokkos<DeviceType>::operator()(TagFixNeighHistoryUnpackExchange, const int &i) const
 {
   int index = d_indices(i);
+
   if (index > -1) {
     int m = (int) d_ubuf(d_buf(i)).i;
+    if (i >= nrecv1)
+      m = nextrarecv1 + (int) d_ubuf(d_buf(nextrarecv1 + i - nrecv1)).i;
+
     int n = (int) d_ubuf(d_buf(m++)).i;
     d_npartner(index) = n;
     for (int p = 0; p < n; p++) {
@@ -471,6 +475,7 @@ void FixNeighHistoryKokkos<DeviceType>::operator()(TagFixNeighHistoryUnpackExcha
 template<class DeviceType>
 void FixNeighHistoryKokkos<DeviceType>::unpack_exchange_kokkos(
   DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d &k_indices, int nrecv,
+  int nrecv1, int nextrarecv1,
   ExecutionSpace /*space*/)
 {
   d_buf = typename AT::t_xfloat_1d_um(
@@ -478,6 +483,9 @@ void FixNeighHistoryKokkos<DeviceType>::unpack_exchange_kokkos(
     k_buf.extent(0)*k_buf.extent(1));
   d_indices = k_indices.view<DeviceType>();
 
+  this->nrecv1 = nrecv1;
+  this->nextrarecv1 = nextrarecv1;
+
   d_npartner = k_npartner.template view<DeviceType>();
   d_partner = k_partner.template view<DeviceType>();
   d_valuepartner = k_valuepartner.template view<DeviceType>();
diff --git a/src/KOKKOS/fix_neigh_history_kokkos.h b/src/KOKKOS/fix_neigh_history_kokkos.h
index 9c07a953c4..dd1ad769b8 100644
--- a/src/KOKKOS/fix_neigh_history_kokkos.h
+++ b/src/KOKKOS/fix_neigh_history_kokkos.h
@@ -72,12 +72,14 @@ class FixNeighHistoryKokkos : public FixNeighHistory, public KokkosBase {
 
   void unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,
                               DAT::tdual_int_1d &indices,int nrecv,
+                              int nrecv1,int nrecv1extra,
                               ExecutionSpace space) override;
 
   typename DAT::tdual_int_2d k_firstflag;
   typename DAT::tdual_float_2d k_firstvalue;
 
  private:
+  int nrecv1,nextrarecv1;
   int nlocal,nsend,beyond_contact;
 
   typename AT::t_tagint_1d tag;
diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
index bd65a6965e..948e3b88f6 100644
--- a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
+++ b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
@@ -128,7 +128,7 @@ void FixNVTSllodKokkos<DeviceType>::nh_v_temp()
 
   d_h_two = Few<double, 6>(h_two);
 
-  if (vdelu.extent(0) < atomKK->nmax)
+  if ((int)vdelu.extent(0) < atomKK->nmax)
     vdelu = typename AT::t_v_array(Kokkos::NoInit("nvt/sllod/kk:vdelu"), atomKK->nmax);
 
   if (!this->psllod_flag) {
diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp
index a2a50d84bb..18d7af75a7 100644
--- a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp
+++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp
@@ -1416,6 +1416,7 @@ KOKKOS_INLINE_FUNCTION
 void FixQEqReaxFFKokkos<DeviceType>::operator()(TagQEqUnpackExchange, const int &i) const
 {
   int index = d_indices(i);
+
   if (index > -1) {
     for (int m = 0; m < nprev; m++) d_s_hist(index,m) = d_buf(i*nprev*2 + m);
     for (int m = 0; m < nprev; m++) d_t_hist(index,m) = d_buf(i*nprev*2 + nprev+m);
@@ -1427,6 +1428,7 @@ void FixQEqReaxFFKokkos<DeviceType>::operator()(TagQEqUnpackExchange, const int
 template <class DeviceType>
 void FixQEqReaxFFKokkos<DeviceType>::unpack_exchange_kokkos(
   DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d &k_indices, int nrecv,
+  int /*nrecv1*/, int /*nextrarecv1*/,
   ExecutionSpace /*space*/)
 {
   k_buf.sync<DeviceType>();
diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.h b/src/KOKKOS/fix_qeq_reaxff_kokkos.h
index 9bc38b0492..6aa345fba6 100644
--- a/src/KOKKOS/fix_qeq_reaxff_kokkos.h
+++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.h
@@ -143,6 +143,7 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase {
 
   void unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,
                               DAT::tdual_int_1d &indices,int nrecv,
+                              int nrecv1,int nextrarecv1,
                               ExecutionSpace space) override;
 
   struct params_qeq{
diff --git a/src/KOKKOS/fix_shake_kokkos.cpp b/src/KOKKOS/fix_shake_kokkos.cpp
index dd6de8f9ec..39f4f4d4fe 100644
--- a/src/KOKKOS/fix_shake_kokkos.cpp
+++ b/src/KOKKOS/fix_shake_kokkos.cpp
@@ -525,7 +525,7 @@ void FixShakeKokkos<DeviceType>::operator()(TagFixShakePostForce<NEIGHFLAG,EVFLA
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>
-int FixShakeKokkos<DeviceType>::dof(int igroup)
+bigint FixShakeKokkos<DeviceType>::dof(int igroup)
 {
   d_mask = atomKK->k_mask.view<DeviceType>();
   d_tag = atomKK->k_tag.view<DeviceType>();
@@ -538,7 +538,7 @@ int FixShakeKokkos<DeviceType>::dof(int igroup)
   // count dof in a cluster if and only if
   // the central atom is in group and atom i is the central atom
 
-  int n = 0;
+  bigint n = 0;
   {
     // local variables for lambda capture
 
@@ -549,7 +549,7 @@ int FixShakeKokkos<DeviceType>::dof(int igroup)
     auto groupbit = group->bitmask[igroup];
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,nlocal),
-     LAMMPS_LAMBDA(const int& i, int& n) {
+     LAMMPS_LAMBDA(const int& i, bigint& n) {
       if (!(mask[i] & groupbit)) return;
       if (d_shake_flag[i] == 0) return;
       if (d_shake_atom(i,0) != tag[i]) return;
@@ -560,8 +560,8 @@ int FixShakeKokkos<DeviceType>::dof(int igroup)
     },n);
   }
 
-  int nall;
-  MPI_Allreduce(&n,&nall,1,MPI_INT,MPI_SUM,world);
+  bigint nall;
+  MPI_Allreduce(&n,&nall,1,MPI_LMP_BIGINT,MPI_SUM,world);
   return nall;
 }
 
@@ -1581,8 +1581,8 @@ void FixShakeKokkos<DeviceType>::pack_exchange_item(const int &mysend, int &offs
     else offset++;
   } else {
 
-    d_buf[mysend] = nsend + offset;
     int m = nsend + offset;
+    d_buf[mysend] = m;
     d_buf[m++] = flag;
     if (flag == 1) {
       d_buf[m++] = d_shake_atom(i,0);
@@ -1703,6 +1703,8 @@ void FixShakeKokkos<DeviceType>::operator()(TagFixShakeUnpackExchange, const int
 
   if (index > -1) {
     int m = d_buf[i];
+    if (i >= nrecv1)
+      m = nextrarecv1 + d_buf[nextrarecv1 + i - nrecv1];
 
     int flag = d_shake_flag[index] = static_cast<int> (d_buf[m++]);
     if (flag == 1) {
@@ -1739,6 +1741,7 @@ void FixShakeKokkos<DeviceType>::operator()(TagFixShakeUnpackExchange, const int
 template <class DeviceType>
 void FixShakeKokkos<DeviceType>::unpack_exchange_kokkos(
   DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d &k_indices, int nrecv,
+  int nrecv1, int nextrarecv1,
   ExecutionSpace /*space*/)
 {
   k_buf.sync<DeviceType>();
@@ -1749,6 +1752,9 @@ void FixShakeKokkos<DeviceType>::unpack_exchange_kokkos(
     k_buf.extent(0)*k_buf.extent(1));
   d_indices = k_indices.view<DeviceType>();
 
+  this->nrecv1 = nrecv1;
+  this->nextrarecv1 = nextrarecv1;
+
   k_shake_flag.template sync<DeviceType>();
   k_shake_atom.template sync<DeviceType>();
   k_shake_type.template sync<DeviceType>();
diff --git a/src/KOKKOS/fix_shake_kokkos.h b/src/KOKKOS/fix_shake_kokkos.h
index 185e69ce86..19f3a2343d 100644
--- a/src/KOKKOS/fix_shake_kokkos.h
+++ b/src/KOKKOS/fix_shake_kokkos.h
@@ -44,8 +44,6 @@ struct TagFixShakeUnpackExchange{};
 template<class DeviceType>
 class FixShakeKokkos : public FixShake, public KokkosBase {
 
- //friend class FixEHEX;
-
  public:
   typedef DeviceType device_type;
   typedef EV_FLOAT value_type;
@@ -77,7 +75,7 @@ class FixShakeKokkos : public FixShake, public KokkosBase {
   void shake_end_of_step(int vflag) override;
   void correct_coordinates(int vflag) override;
 
-  int dof(int) override;
+  bigint dof(int) override;
 
   void unconstrained_update() override;
 
@@ -112,9 +110,12 @@ class FixShakeKokkos : public FixShake, public KokkosBase {
 
   void unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,
                               DAT::tdual_int_1d &indices,int nrecv,
+                              int nrecv1,int nrecv1extra,
                               ExecutionSpace space) override;
 
  protected:
+  int nrecv1,nextrarecv1;
+
   typename AT::t_x_array d_x;
   typename AT::t_v_array d_v;
   typename AT::t_f_array d_f;
@@ -259,4 +260,3 @@ struct FixShakeKokkosPackExchangeFunctor {
 
 #endif
 #endif
-
diff --git a/src/KOKKOS/fix_spring_self_kokkos.cpp b/src/KOKKOS/fix_spring_self_kokkos.cpp
index efd8a652ff..6571db37ed 100644
--- a/src/KOKKOS/fix_spring_self_kokkos.cpp
+++ b/src/KOKKOS/fix_spring_self_kokkos.cpp
@@ -184,12 +184,12 @@ void FixSpringSelfKokkos<DeviceType>::copy_arrays(int i, int j, int delflag)
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void FixSpringSelfKokkos<DeviceType>::pack_exchange_item(const int &mysend, int &offset, const bool &final) const
+void FixSpringSelfKokkos<DeviceType>::pack_exchange_item(const int &mysend, int &offset, const bool &/*final*/) const
 {
   const int i = d_exchange_sendlist(mysend);
 
-  d_buf[mysend] = nsend + offset;
   int m = nsend + offset;
+  d_buf[mysend] = m;
   d_buf[m++] = d_xoriginal(i,0);
   d_buf[m++] = d_xoriginal(i,1);
   d_buf[m++] = d_xoriginal(i,2);
@@ -258,6 +258,8 @@ void FixSpringSelfKokkos<DeviceType>::operator()(TagFixSpringSelfUnpackExchange,
 
   if (index > -1) {
     int m = d_buf[i];
+    if (i >= nrecv1)
+      m = nextrarecv1 + d_buf[nextrarecv1 + i - nrecv1];
 
     d_xoriginal(index,0) = static_cast<tagint> (d_buf[m++]);
     d_xoriginal(index,1) = static_cast<tagint> (d_buf[m++]);
@@ -270,6 +272,7 @@ void FixSpringSelfKokkos<DeviceType>::operator()(TagFixSpringSelfUnpackExchange,
 template <class DeviceType>
 void FixSpringSelfKokkos<DeviceType>::unpack_exchange_kokkos(
   DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d &k_indices, int nrecv,
+  int nrecv1, int nextrarecv1,
   ExecutionSpace /*space*/)
 {
   k_buf.sync<DeviceType>();
@@ -280,6 +283,9 @@ void FixSpringSelfKokkos<DeviceType>::unpack_exchange_kokkos(
     k_buf.extent(0)*k_buf.extent(1));
   d_indices = k_indices.view<DeviceType>();
 
+  this->nrecv1 = nrecv1;
+  this->nextrarecv1 = nextrarecv1;
+
   k_xoriginal.template sync<DeviceType>();
 
   copymode = 1;
diff --git a/src/KOKKOS/fix_spring_self_kokkos.h b/src/KOKKOS/fix_spring_self_kokkos.h
index b23e92249b..add5a80bc7 100644
--- a/src/KOKKOS/fix_spring_self_kokkos.h
+++ b/src/KOKKOS/fix_spring_self_kokkos.h
@@ -58,6 +58,7 @@ class FixSpringSelfKokkos : public FixSpringSelf, public KokkosBase {
 
   void unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,
                               DAT::tdual_int_1d &indices,int nrecv,
+                              int nrecv1,int nrecv1extra,
                               ExecutionSpace space) override;
 
 
@@ -65,6 +66,8 @@ class FixSpringSelfKokkos : public FixSpringSelf, public KokkosBase {
   int unpack_exchange(int, double *) override;
 
  protected:
+  int nrecv1,nextrarecv1;
+
   DAT::tdual_x_array k_xoriginal;
   typename AT::t_x_array d_xoriginal;
 
diff --git a/src/KOKKOS/fix_wall_gran_kokkos.cpp b/src/KOKKOS/fix_wall_gran_kokkos.cpp
index f870b0f240..25e405c798 100644
--- a/src/KOKKOS/fix_wall_gran_kokkos.cpp
+++ b/src/KOKKOS/fix_wall_gran_kokkos.cpp
@@ -419,6 +419,7 @@ void FixWallGranKokkos<DeviceType>::operator()(TagFixWallGranUnpackExchange, con
 template<class DeviceType>
 void FixWallGranKokkos<DeviceType>::unpack_exchange_kokkos(
   DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d &k_indices, int nrecv,
+  int /*nrecv1*/, int /*nextrarecv1*/,
   ExecutionSpace /*space*/)
 {
   d_buf = typename ArrayTypes<DeviceType>::t_xfloat_1d_um(
@@ -430,7 +431,6 @@ void FixWallGranKokkos<DeviceType>::unpack_exchange_kokkos(
 
   copymode = 1;
 
-
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixWallGranUnpackExchange>(0,nrecv),*this);
 
   copymode = 0;
diff --git a/src/KOKKOS/fix_wall_gran_kokkos.h b/src/KOKKOS/fix_wall_gran_kokkos.h
index c7d566ec72..ae54fdb085 100644
--- a/src/KOKKOS/fix_wall_gran_kokkos.h
+++ b/src/KOKKOS/fix_wall_gran_kokkos.h
@@ -62,12 +62,13 @@ class FixWallGranKokkos : public FixWallGranOld, public KokkosBase {
   void operator()(TagFixWallGranUnpackExchange, const int&) const;
 
   int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
-			   DAT::tdual_int_1d k_sendlist,
-			   DAT::tdual_int_1d k_copylist,
-			   ExecutionSpace space) override;
+                           DAT::tdual_int_1d k_sendlist,
+                           DAT::tdual_int_1d k_copylist,
+                           ExecutionSpace space) override;
 
   void unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,
                               DAT::tdual_int_1d &indices,int nrecv,
+                              int nrecv1,int nextrarecv1,  // names match KokkosBase; the definition leaves both unused
                               ExecutionSpace space) override;
 
  private:
@@ -91,6 +92,7 @@ class FixWallGranKokkos : public FixWallGranOld, public KokkosBase {
   typename AT::t_int_1d d_copylist;
   typename AT::t_int_1d d_indices;
 };
+
 }
 
 #endif
diff --git a/src/KOKKOS/grid3d_kokkos.cpp b/src/KOKKOS/grid3d_kokkos.cpp
index 7b97c417dd..87f2baff84 100644
--- a/src/KOKKOS/grid3d_kokkos.cpp
+++ b/src/KOKKOS/grid3d_kokkos.cpp
@@ -635,7 +635,7 @@ void Grid3dKokkos<DeviceType>::setup_comm_tiled(int &nbuf1, int &nbuf2)
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>
-void Grid3dKokkos<DeviceType>::forward_comm(int caller, void *ptr, int which, int nper, int nbyte,
+void Grid3dKokkos<DeviceType>::forward_comm(int caller, void *ptr, int which, int nper, int /*nbyte*/,
                             FFT_DAT::tdual_FFT_SCALAR_1d& k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d& k_buf2,
                             MPI_Datatype datatype)
 {
@@ -645,7 +645,7 @@ void Grid3dKokkos<DeviceType>::forward_comm(int caller, void *ptr, int which, in
   else
     forward_comm_kspace_tiled((KSpace *) ptr,which,nper,k_buf1,k_buf2,datatype);
   } else
-    error->all(FLERR,"Kokkos grid comm only supports Kspace");
+    error->all(FLERR,"Kokkos grid comm currently only supports Kspace");
 }
 
 /* ----------------------------------------------------------------------
@@ -775,7 +775,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int which, int nper,
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>
-void Grid3dKokkos<DeviceType>::reverse_comm(int caller, void *ptr, int which, int nper, int nbyte,
+void Grid3dKokkos<DeviceType>::reverse_comm(int caller, void *ptr, int which, int nper, int /*nbyte*/,
                             FFT_DAT::tdual_FFT_SCALAR_1d& k_buf1, FFT_DAT::tdual_FFT_SCALAR_1d& k_buf2,
                             MPI_Datatype datatype)
 {
@@ -945,7 +945,7 @@ int Grid3dKokkos<DeviceType>::indices(DAT::tdual_int_2d &k_list, int index,
                        int xlo, int xhi, int ylo, int yhi, int zlo, int zhi)
 {
   int nmax = (xhi-xlo+1) * (yhi-ylo+1) * (zhi-zlo+1);
-  if (k_list.extent(1) < nmax)
+  if ((int)k_list.extent(1) < nmax)
     k_list.resize(k_list.extent(0),nmax);
 
   if (nmax == 0) return 0;
diff --git a/src/KOKKOS/kissfft_kokkos.h b/src/KOKKOS/kissfft_kokkos.h
index 265677a21c..e24768f774 100644
--- a/src/KOKKOS/kissfft_kokkos.h
+++ b/src/KOKKOS/kissfft_kokkos.h
@@ -489,7 +489,7 @@ class KissFFTKokkos {
    * It can be freed with free(), rather than a kiss_fft-specific function.
    */
 
-  static kiss_fft_state_kokkos<DeviceType> kiss_fft_alloc_kokkos(int nfft, int inverse_fft, void *mem, size_t *lenmem)
+  static kiss_fft_state_kokkos<DeviceType> kiss_fft_alloc_kokkos(int nfft, int inverse_fft, void * /*mem*/, size_t * /*lenmem*/)
   {
       kiss_fft_state_kokkos<DeviceType> st;
       int i;
diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 5572f69901..b8bcd80a00 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -622,7 +622,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
 
 int KokkosLMP::neigh_count(int m)
 {
-  int inum;
+  int inum = 0;
   int nneigh = 0;
 
   ArrayTypes<LMPHostType>::t_int_1d h_ilist;
diff --git a/src/KOKKOS/kokkos_base.h b/src/KOKKOS/kokkos_base.h
index 1e22a38657..24fcc47579 100644
--- a/src/KOKKOS/kokkos_base.h
+++ b/src/KOKKOS/kokkos_base.h
@@ -47,6 +47,7 @@ class KokkosBase {
                                    ExecutionSpace /*space*/) { return 0; }
   virtual void unpack_exchange_kokkos(DAT::tdual_xfloat_2d & /*k_buf*/,
                                       DAT::tdual_int_1d & /*indices*/, int /*nrecv*/,
+                                      int /*nrecv1*/, int /*nextrarecv1*/,
                                       ExecutionSpace /*space*/) {}
 
   // Region
diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h
index c8ab2198d6..1009e43196 100644
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@@ -453,13 +453,6 @@ struct alignas(2*sizeof(F_FLOAT)) s_FLOAT2 {
     v[0] = v[1] = 0.0;
   }
 
-  KOKKOS_INLINE_FUNCTION
-  s_FLOAT2(const s_FLOAT2 & rhs) {
-    for (int i = 0; i < 2; i++){
-      v[i] = rhs.v[i];
-    }
-  }
-
   KOKKOS_INLINE_FUNCTION
   void operator+=(const s_FLOAT2 &rhs) {
     v[0] += rhs.v[0];
diff --git a/src/KOKKOS/min_linesearch_kokkos.cpp b/src/KOKKOS/min_linesearch_kokkos.cpp
index e8a22f9ddb..2d424957c5 100644
--- a/src/KOKKOS/min_linesearch_kokkos.cpp
+++ b/src/KOKKOS/min_linesearch_kokkos.cpp
@@ -59,8 +59,8 @@ MinLineSearchKokkos::MinLineSearchKokkos(LAMMPS *lmp) : MinKokkos(lmp)
 
 MinLineSearchKokkos::~MinLineSearchKokkos()
 {
-  delete [] gextra;
-  delete [] hextra;
+  delete[] gextra;
+  delete[] hextra;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -171,8 +171,8 @@ int MinLineSearchKokkos::linemin_quadratic(double eoriginal, double &alpha)
 {
   double fdothall,fdothme,hme,hmaxall;
   double de_ideal,de;
-  double delfh,engprev,relerr,alphaprev,fhprev,ff,fh,alpha0;
-  double dot[2],dotall[2];
+  double delfh,engprev,relerr,alphaprev,fhprev,fh,alpha0;
+  double dot,dotall;
   double alphamax;
 
   fix_minimize_kk->k_vectors.sync<LMPDeviceType>();
@@ -280,22 +280,16 @@ int MinLineSearchKokkos::linemin_quadratic(double eoriginal, double &alpha)
         sdot.d1 += l_fvec[i]*l_h[i];
       },sdot);
     }
-    dot[0] = sdot.d0;
-    dot[1] = sdot.d1;
+    dot = sdot.d1;
 
-    MPI_Allreduce(dot,dotall,2,MPI_DOUBLE,MPI_SUM,world);
+    MPI_Allreduce(&dot,&dotall,1,MPI_DOUBLE,MPI_SUM,world);
     if (nextra_global) {
       for (int i = 0; i < nextra_global; i++) {
-        dotall[0] += fextra[i]*fextra[i];
-        dotall[1] += fextra[i]*hextra[i];
+        dotall += fextra[i]*hextra[i];
       }
     }
-    ff = dotall[0];
-    fh = dotall[1];
-    if (output->thermo->normflag) {
-      ff /= atom->natoms;
-      fh /= atom->natoms;
-    }
+    fh = dotall;
+    if (output->thermo->normflag) fh /= atom->natoms;
 
     delfh = fh - fhprev;
 
diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h
index fe5484a771..8dd7a1c5ef 100644
--- a/src/KOKKOS/npair_kokkos.h
+++ b/src/KOKKOS/npair_kokkos.h
@@ -303,7 +303,7 @@ class NeighborKokkosExecute
                         const typename ArrayTypes<LMPHostType>::t_int_scalar _h_resize,
                         const typename AT::t_int_scalar _new_maxneighs,
                         const typename ArrayTypes<LMPHostType>::t_int_scalar _h_new_maxneighs):
-    neigh_list(_neigh_list), cutneighsq(_cutneighsq),delta(_delta),exclude(_exclude),
+    neigh_list(_neigh_list),delta(_delta),cutneighsq(_cutneighsq),exclude(_exclude),
     nex_type(_nex_type),ex1_type(_ex1_type),ex2_type(_ex2_type),
     ex_type(_ex_type),nex_group(_nex_group),
     ex1_bit(_ex1_bit),ex2_bit(_ex2_bit),
@@ -319,10 +319,11 @@ class NeighborKokkosExecute
     mbinxlo(_mbinxlo),mbinylo(_mbinylo),mbinzlo(_mbinzlo),
     bininvx(_bininvx),bininvy(_bininvy),bininvz(_bininvz),
     nlocal(_nlocal),nall(_nall),neigh_transpose(_neigh_transpose),
+    resize(_resize),new_maxneighs(_new_maxneighs),
+    h_resize(_h_resize),h_new_maxneighs(_h_new_maxneighs),
     xperiodic(_xperiodic),yperiodic(_yperiodic),zperiodic(_zperiodic),
     xprd_half(_xprd_half),yprd_half(_yprd_half),zprd_half(_zprd_half),
-    skin(_skin),resize(_resize),h_resize(_h_resize),
-    new_maxneighs(_new_maxneighs),h_new_maxneighs(_h_new_maxneighs) {
+    skin(_skin) {
 
     if (molecular == 2) moltemplate = 1;
     else moltemplate = 0;
diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h
index 9521268284..87324b49b9 100644
--- a/src/KOKKOS/pair_kokkos.h
+++ b/src/KOKKOS/pair_kokkos.h
@@ -627,7 +627,7 @@ struct PairComputeFunctor  {
       const int itype = c.type(i);
       const F_FLOAT qtmp = c.q(i);
 
-      if (ZEROFLAG) {
+      if (NEIGHFLAG == FULL && ZEROFLAG) {
         Kokkos::single(Kokkos::PerThread(team), [&] (){
           f(i,0) = 0.0;
           f(i,1) = 0.0;
@@ -674,7 +674,7 @@ struct PairComputeFunctor  {
           const int J_CONTRIB = ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal);
           const E_FLOAT factor = J_CONTRIB?1.0:0.5;
 
-          if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) {
+          if (J_CONTRIB) {
             a_f(j,0) -= fx;
             a_f(j,1) -= fy;
             a_f(j,2) -= fz;
@@ -746,8 +746,10 @@ struct PairComputeFunctor  {
         a_f(i,1) += fev.f[1];
         a_f(i,2) += fev.f[2];
 
-        if (c.eflag_global)
+        if (c.eflag_global) {
           ev.evdwl += fev.evdwl;
+          ev.ecoul += fev.ecoul;
+        }
 
         if (c.vflag_global) {
           ev.v[0] += fev.v[0];
@@ -761,7 +763,7 @@ struct PairComputeFunctor  {
         if (NEIGHFLAG == FULL) {
 
           if (c.eflag_atom)
-            a_eatom(i) += fev.evdwl;
+            a_eatom(i) += fev.evdwl + fev.ecoul;
 
           if (c.vflag_atom) {
             a_vatom(i,0) += fev.v[0];
@@ -948,9 +950,9 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P
 
     static int vectorsize = 0;
     static int atoms_per_team = 0;
-    static int lastcall = -1;
 
 #if defined(LMP_KOKKOS_GPU)
+    static int lastcall = -1;
     if (!vectorsize || lastcall < fpair->lmp->neighbor->lastcall) {
       lastcall = fpair->lmp->update->ntimestep;
       vectorsize = GetMaxNeighs(list);
diff --git a/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.cpp b/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.cpp
index 4caab0ef55..c7e10d39ef 100644
--- a/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.cpp
+++ b/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.cpp
@@ -214,9 +214,7 @@ compute_evdwl(const F_FLOAT& rsq, const int& /*i*/, const int& /*j*/,
       (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj;
     englj *= switch1;
   }
-
   return englj;
-
 }
 
 /* ----------------------------------------------------------------------
@@ -488,4 +486,3 @@ template class PairLJCharmmCoulLongKokkos<LMPDeviceType>;
 template class PairLJCharmmCoulLongKokkos<LMPHostType>;
 #endif
 }
-
diff --git a/src/KOKKOS/pair_lj_charmmfsw_coul_long_kokkos.cpp b/src/KOKKOS/pair_lj_charmmfsw_coul_long_kokkos.cpp
new file mode 100644
index 0000000000..f412721411
--- /dev/null
+++ b/src/KOKKOS/pair_lj_charmmfsw_coul_long_kokkos.cpp
@@ -0,0 +1,497 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Mitch Murphy (alphataubio)
+
+   Based on serial kspace lj-fsw sections (force-switched) provided by
+   Robert Meissner and Lucio Colombi Ciacchi of Bremen University, Germany,
+   with additional assistance from Robert A. Latour, Clemson University
+
+ ------------------------------------------------------------------------- */
+
+#include "pair_lj_charmmfsw_coul_long_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor.h"
+#include "respa.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairLJCharmmfswCoulLongKokkos<DeviceType>::PairLJCharmmfswCoulLongKokkos(LAMMPS *lmp):PairLJCharmmfswCoulLong(lmp)
+{
+  respa_enable = 0;  // init_style() raises an error for rRESPA inner/middle levels
+  // Kokkos bookkeeping: which space this instance runs in and which atom fields it touches
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;  // chosen by the DeviceType template argument
+  datamask_read = X_MASK | F_MASK | TYPE_MASK | Q_MASK | ENERGY_MASK | VIRIAL_MASK;  // synced in compute() before the kernel
+  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;  // flagged modified in compute() when eflag or vflag is set
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairLJCharmmfswCoulLongKokkos<DeviceType>::~PairLJCharmmfswCoulLongKokkos()
+{
+  // nothing to release while copymode is set or when the allocated flag is unset
+  if (copymode || !allocated) return;
+
+  // release the eatom, vatom and cutsq Kokkos arrays
+  memoryKK->destroy_kokkos(k_eatom,eatom);
+  memoryKK->destroy_kokkos(k_vatom,vatom);
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairLJCharmmfswCoulLongKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  eflag = eflag_in;
+  vflag = vflag_in;
+
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;  // full neighbor list: turn the f-dot-r virial path off
+
+  ev_init(eflag,vflag,0);
+
+  // recreate the per-atom energy/virial arrays whenever per-atom tallies are requested
+
+  if (eflag_atom) {
+    memoryKK->destroy_kokkos(k_eatom,eatom);
+    memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memoryKK->destroy_kokkos(k_vatom,vatom);
+    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
+    d_vatom = k_vatom.view<DeviceType>();
+  }
+  // bring atom data, cutoffs and coefficients up to date in this execution space
+  atomKK->sync(execution_space,datamask_read);
+  k_cutsq.template sync<DeviceType>();
+  k_params.template sync<DeviceType>();
+  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
+  else atomKK->modified(execution_space,F_MASK);  // no tallies requested: only forces are flagged as modified
+  // cache the views and scalars that the compute_* member functions read
+  x = atomKK->k_x.view<DeviceType>();
+  c_x = atomKK->k_x.view<DeviceType>();  // second handle on the same k_x data
+  f = atomKK->k_f.view<DeviceType>();
+  q = atomKK->k_q.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  nlocal = atom->nlocal;
+  nall = atom->nlocal + atom->nghost;
+  special_lj[0] = force->special_lj[0];
+  special_lj[1] = force->special_lj[1];
+  special_lj[2] = force->special_lj[2];
+  special_lj[3] = force->special_lj[3];
+  special_coul[0] = force->special_coul[0];
+  special_coul[1] = force->special_coul[1];
+  special_coul[2] = force->special_coul[2];
+  special_coul[3] = force->special_coul[3];
+  qqrd2e = force->qqrd2e;
+  newton_pair = force->newton_pair;
+
+  // loop over neighbors of my atoms
+
+  copymode = 1;  // the destructor returns immediately while this is set
+
+  EV_FLOAT ev;
+  if (ncoultablebits)  // nonzero: use the CoulLongTable<1> specialisation, otherwise CoulLongTable<0>
+    ev = pair_compute<PairLJCharmmfswCoulLongKokkos<DeviceType>,CoulLongTable<1> >
+      (this,(NeighListKokkos<DeviceType>*)list);
+  else
+    ev = pair_compute<PairLJCharmmfswCoulLongKokkos<DeviceType>,CoulLongTable<0> >
+      (this,(NeighListKokkos<DeviceType>*)list);
+
+  // fold the totals returned by pair_compute into the global accumulators
+  if (eflag) {
+    eng_vdwl += ev.evdwl;
+    eng_coul += ev.ecoul;
+  }
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+  // per-atom tallies: mark the DeviceType copy as modified, then sync to the host
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
+  copymode = 0;
+}
+
+/* ----------------------------------------------------------------------
+   compute LJ CHARMM pair force between atoms i and j
+   ---------------------------------------------------------------------- */
+template<class DeviceType>
+template<bool STACKPARAMS, class Specialisation>
+KOKKOS_INLINE_FUNCTION
+F_FLOAT PairLJCharmmfswCoulLongKokkos<DeviceType>::
+compute_fpair(const F_FLOAT& rsq, const int& /*i*/, const int& /*j*/,
+              const int& itype, const int& jtype) const {
+  const F_FLOAT r2inv = 1.0/rsq;
+  const F_FLOAT r6inv = r2inv*r2inv*r2inv;
+  F_FLOAT forcelj, switch1;
+  // 12-6 term; lj1/lj2 come from m_params or params depending on STACKPARAMS
+  forcelj = r6inv *
+    ((STACKPARAMS?m_params[itype][jtype].lj1:params(itype,jtype).lj1)*r6inv -
+     (STACKPARAMS?m_params[itype][jtype].lj2:params(itype,jtype).lj2));
+  // beyond the inner cutoff the force is scaled by the switching polynomial
+  if (rsq > cut_lj_innersq) {
+    switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
+              (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj;
+    forcelj = forcelj*switch1;
+  }
+
+  return forcelj*r2inv;  // returned value is the LJ term divided by rsq
+}
+
+/* ----------------------------------------------------------------------
+   compute LJ CHARMM pair potential energy between atoms i and j
+   ---------------------------------------------------------------------- */
+template<class DeviceType>
+template<bool STACKPARAMS, class Specialisation>
+KOKKOS_INLINE_FUNCTION
+F_FLOAT PairLJCharmmfswCoulLongKokkos<DeviceType>::
+compute_evdwl(const F_FLOAT& rsq, const int& /*i*/, const int& /*j*/,
+              const int& itype, const int& jtype) const {
+  // LJ energy for one pair at squared distance rsq; the i/j indices are unused.
+  // lj3/lj4 are read from the stack copy (m_params) or the view (params),
+  // depending on the compile-time STACKPARAMS flag.
+  const F_FLOAT lj3 = STACKPARAMS?m_params[itype][jtype].lj3:params(itype,jtype).lj3;
+  const F_FLOAT lj4 = STACKPARAMS?m_params[itype][jtype].lj4:params(itype,jtype).lj4;
+  const F_FLOAT r2inv = 1.0/rsq;
+  const F_FLOAT r6inv = r2inv*r2inv*r2inv;
+
+  if (rsq > cut_lj_innersq) {
+    // beyond the inner cutoff: only this branch needs r^-3, so the sqrt lives here
+    const F_FLOAT rinv = 1.0/sqrt(rsq);
+    const F_FLOAT r3inv = rinv*rinv*rinv;
+    const F_FLOAT englj12 = lj3*cut_lj6*denom_lj12 * (r6inv - cut_lj6inv)*(r6inv - cut_lj6inv);
+    const F_FLOAT englj6 = -lj4*cut_lj3*denom_lj6 * (r3inv - cut_lj3inv)*(r3inv - cut_lj3inv);
+    return englj12 + englj6;
+  }
+
+  // at or inside the inner cutoff: 12-6 terms plus constant offsets built from
+  // the inner and outer cutoff powers
+  const F_FLOAT englj12 = r6inv*lj3*r6inv - lj3*cut_lj_inner6inv*cut_lj6inv;
+  const F_FLOAT englj6 = -lj4*r6inv + lj4*cut_lj_inner3inv*cut_lj3inv;
+  return englj12 + englj6;
+}
+
+/* ----------------------------------------------------------------------
+   compute coulomb pair force between atoms i and j
+   ---------------------------------------------------------------------- */
+template<class DeviceType>
+template<bool STACKPARAMS,  class Specialisation>
+KOKKOS_INLINE_FUNCTION
+F_FLOAT PairLJCharmmfswCoulLongKokkos<DeviceType>::
+compute_fcoul(const F_FLOAT& rsq, const int& /*i*/, const int&j,
+              const int& /*itype*/, const int& /*jtype*/,
+              const F_FLOAT& factor_coul, const F_FLOAT& qtmp) const {
+  if (Specialisation::DoTable && rsq > tabinnersq) {  // tabulated branch
+    union_int_float_t rsq_lookup;
+    rsq_lookup.f = rsq;
+    const int itable = (rsq_lookup.i & ncoulmask) >> ncoulshiftbits;  // table index taken from the bits of rsq
+    const F_FLOAT fraction = (rsq_lookup.f - d_rtable[itable]) * d_drtable[itable];
+    const F_FLOAT table = d_ftable[itable] + fraction*d_dftable[itable];  // linear interpolation within the bin
+    F_FLOAT forcecoul = qtmp*q[j] * table;
+    if (factor_coul < 1.0) {  // scaled pair: remove the (1-factor_coul) share using ctable
+      const F_FLOAT table = d_ctable[itable] + fraction*d_dctable[itable];
+      const F_FLOAT prefactor = qtmp*q[j] * table;
+      forcecoul -= (1.0-factor_coul)*prefactor;
+    }
+    return forcecoul/rsq;
+  } else {  // analytic branch
+    const F_FLOAT r = sqrt(rsq);
+    const F_FLOAT grij = g_ewald * r;
+    const F_FLOAT expm2 = exp(-grij*grij);
+    const F_FLOAT t = 1.0 / (1.0 + EWALD_P*grij);
+    const F_FLOAT rinv = 1.0/r;
+    const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;  // polynomial in t (A1..A5) times exp(-grij^2)
+    const F_FLOAT prefactor = qqrd2e * qtmp*q[j]*rinv;
+    F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+    if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;  // same scaled-pair correction as above
+    // both branches return the force term divided by rsq
+    return forcecoul*rinv*rinv;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute coulomb pair potential energy between atoms i and j
+   ---------------------------------------------------------------------- */
+template<class DeviceType>
+template<bool STACKPARAMS, class Specialisation>
+KOKKOS_INLINE_FUNCTION
+F_FLOAT PairLJCharmmfswCoulLongKokkos<DeviceType>::
+compute_ecoul(const F_FLOAT& rsq, const int& /*i*/, const int&j,
+              const int& /*itype*/, const int& /*jtype*/, const F_FLOAT& factor_coul, const F_FLOAT& qtmp) const {
+  if (Specialisation::DoTable && rsq > tabinnersq) {  // tabulated branch
+    union_int_float_t rsq_lookup;
+    rsq_lookup.f = rsq;
+    const int itable = (rsq_lookup.i & ncoulmask) >> ncoulshiftbits;  // table index taken from the bits of rsq
+    const F_FLOAT fraction = (rsq_lookup.f - d_rtable[itable]) * d_drtable[itable];
+    const F_FLOAT table = d_etable[itable] + fraction*d_detable[itable];  // linear interpolation of the energy table
+    F_FLOAT ecoul = qtmp*q[j] * table;
+    if (factor_coul < 1.0) {  // scaled pair: remove the (1-factor_coul) share using ctable
+      const F_FLOAT table = d_ctable[itable] + fraction*d_dctable[itable];
+      const F_FLOAT prefactor = qtmp*q[j] * table;
+      ecoul -= (1.0-factor_coul)*prefactor;
+    }
+    return ecoul;
+  } else {  // analytic branch
+    const F_FLOAT r = sqrt(rsq);
+    const F_FLOAT grij = g_ewald * r;
+    const F_FLOAT expm2 = exp(-grij*grij);
+    const F_FLOAT t = 1.0 / (1.0 + EWALD_P*grij);
+    const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;  // polynomial in t (A1..A5) times exp(-grij^2)
+    const F_FLOAT prefactor = qqrd2e * qtmp*q[j]/r;
+    F_FLOAT ecoul = prefactor * erfc;
+    if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;  // same scaled-pair correction as above
+    return ecoul;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairLJCharmmfswCoulLongKokkos<DeviceType>::allocate()
+{
+  PairLJCharmmfswCoulLong::allocate();
+
+  int n = atom->ntypes;
+  // swap the cutsq array from the base allocate() for a Kokkos-managed one
+  memory->destroy(cutsq);
+  memoryKK->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq");
+  d_cutsq = k_cutsq.template view<DeviceType>();
+  // (n+1) x (n+1) views; init_style() fills them with deep_copy
+  d_cut_ljsq = typename AT::t_ffloat_2d("pair:cut_ljsq",n+1,n+1);
+
+  d_cut_coulsq = typename AT::t_ffloat_2d("pair:cut_coulsq",n+1,n+1);
+  // per-type-pair coefficients; init_one() writes the host side
+  k_params = Kokkos::DualView<params_lj_coul**,Kokkos::LayoutRight,DeviceType>("PairLJCharmmfswCoulLong::params",n+1,n+1);
+  params = k_params.template view<DeviceType>();
+}
+
+template<class DeviceType>
+void PairLJCharmmfswCoulLongKokkos<DeviceType>::init_tables(double cut_coul, double *cut_respa)
+{
+  Pair::init_tables(cut_coul,cut_respa);
+
+  typedef typename ArrayTypes<DeviceType>::t_ffloat_1d table_type;
+  typedef typename ArrayTypes<LMPHostType>::t_ffloat_1d host_table_type;
+
+  int ntable = 1;
+  for (int i = 0; i < ncoultablebits; i++) ntable *= 2;  // ntable = 2^ncoultablebits
+
+  // every block below: fill a host view from one table, deep_copy it to a DeviceType view, keep that view
+  // Copy rtable and drtable
+  {
+  host_table_type h_table("HostTable",ntable);
+  table_type d_table("DeviceTable",ntable);
+  for (int i = 0; i < ntable; i++) {
+    h_table(i) = rtable[i];
+  }
+  Kokkos::deep_copy(d_table,h_table);
+  d_rtable = d_table;
+  }
+
+  {
+  host_table_type h_table("HostTable",ntable);
+  table_type d_table("DeviceTable",ntable);
+  for (int i = 0; i < ntable; i++) {
+    h_table(i) = drtable[i];
+  }
+  Kokkos::deep_copy(d_table,h_table);
+  d_drtable = d_table;
+  }
+
+  {
+  host_table_type h_table("HostTable",ntable);
+  table_type d_table("DeviceTable",ntable);
+
+  // Copy ftable and dftable
+  for (int i = 0; i < ntable; i++) {
+    h_table(i) = ftable[i];
+  }
+  Kokkos::deep_copy(d_table,h_table);
+  d_ftable = d_table;
+  }
+
+  {
+  host_table_type h_table("HostTable",ntable);
+  table_type d_table("DeviceTable",ntable);
+  // dftable
+  for (int i = 0; i < ntable; i++) {
+    h_table(i) = dftable[i];
+  }
+  Kokkos::deep_copy(d_table,h_table);
+  d_dftable = d_table;
+  }
+
+  {
+  host_table_type h_table("HostTable",ntable);
+  table_type d_table("DeviceTable",ntable);
+
+  // Copy ctable and dctable
+  for (int i = 0; i < ntable; i++) {
+    h_table(i) = ctable[i];
+  }
+  Kokkos::deep_copy(d_table,h_table);
+  d_ctable = d_table;
+  }
+
+  {
+  host_table_type h_table("HostTable",ntable);
+  table_type d_table("DeviceTable",ntable);
+  // dctable
+  for (int i = 0; i < ntable; i++) {
+    h_table(i) = dctable[i];
+  }
+  Kokkos::deep_copy(d_table,h_table);
+  d_dctable = d_table;
+  }
+
+  {
+  host_table_type h_table("HostTable",ntable);
+  table_type d_table("DeviceTable",ntable);
+
+  // Copy etable and detable
+  for (int i = 0; i < ntable; i++) {
+    h_table(i) = etable[i];
+  }
+  Kokkos::deep_copy(d_table,h_table);
+  d_etable = d_table;
+  }
+
+  {
+  host_table_type h_table("HostTable",ntable);
+  table_type d_table("DeviceTable",ntable);
+  // detable
+  for (int i = 0; i < ntable; i++) {
+    h_table(i) = detable[i];
+  }
+  Kokkos::deep_copy(d_table,h_table);
+  d_detable = d_table;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairLJCharmmfswCoulLongKokkos<DeviceType>::init_style()
+{
+  PairLJCharmmfswCoulLong::init_style();
+  // NOTE(review): cut_ljsq/cut_coulsq look like single values (see init_one), so these fill the whole view - confirm
+  Kokkos::deep_copy(d_cut_ljsq,cut_ljsq);
+  Kokkos::deep_copy(d_cut_coulsq,cut_coulsq);
+
+  // error if rRESPA with inner or middle levels
+
+  if (update->whichflag == 1 && utils::strmatch(update->integrate_style,"^respa")) {
+    int respa = 0;
+    if (((Respa *) update->integrate)->level_inner >= 0) respa = 1;
+    if (((Respa *) update->integrate)->level_middle >= 0) respa = 2;
+    if (respa)  // either value is an error; 1 and 2 are not distinguished here
+      error->all(FLERR,"Cannot use Kokkos pair style with rRESPA inner/middle");
+  }
+
+  // adjust neighbor list request for KOKKOS
+
+  neighflag = lmp->kokkos->neighflag;
+  auto request = neighbor->find_request(this);
+  request->set_kokkos_host(std::is_same_v<DeviceType,LMPHostType> &&  // true only if DeviceType is the host type ...
+                           !std::is_same_v<DeviceType,LMPDeviceType>);  // ... and differs from LMPDeviceType
+  request->set_kokkos_device(std::is_same_v<DeviceType,LMPDeviceType>);
+  if (neighflag == FULL) request->enable_full();
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairLJCharmmfswCoulLongKokkos<DeviceType>::init_one(int i, int j)
+{
+  double cutone = PairLJCharmmfswCoulLong::init_one(i,j);
+  // copy the coefficients for type pair (i,j) into the host side of k_params
+  k_params.h_view(i,j).lj1 = lj1[i][j];
+  k_params.h_view(i,j).lj2 = lj2[i][j];
+  k_params.h_view(i,j).lj3 = lj3[i][j];
+  k_params.h_view(i,j).lj4 = lj4[i][j];
+  //k_params.h_view(i,j).offset = offset[i][j];  (disabled: offset is not copied into the params)
+  k_params.h_view(i,j).cut_ljsq = cut_ljsq;
+  k_params.h_view(i,j).cut_coulsq = cut_coulsq;
+
+  k_params.h_view(j,i) = k_params.h_view(i,j);  // mirror into the (j,i) entry
+  if (i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) {  // also fill the fixed-size m_* arrays when both types fit
+    m_params[i][j] = m_params[j][i] = k_params.h_view(i,j);
+    m_cutsq[j][i] = m_cutsq[i][j] = cutone*cutone;
+    m_cut_ljsq[j][i] = m_cut_ljsq[i][j] = cut_ljsq;
+    m_cut_coulsq[j][i] = m_cut_coulsq[i][j] = cut_coulsq;
+  }
+
+  k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone;
+  k_cutsq.template modify<LMPHostType>();  // host copies changed; compute() syncs them to DeviceType
+  k_params.template modify<LMPHostType>();
+
+  return cutone;
+}
+
+namespace LAMMPS_NS {  // explicit instantiations of the pair style
+template class PairLJCharmmfswCoulLongKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU  // with this macro defined, the host-type variant is instantiated as well
+template class PairLJCharmmfswCoulLongKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/pair_lj_charmmfsw_coul_long_kokkos.h b/src/KOKKOS/pair_lj_charmmfsw_coul_long_kokkos.h
new file mode 100644
index 0000000000..7533f40dbc
--- /dev/null
+++ b/src/KOKKOS/pair_lj_charmmfsw_coul_long_kokkos.h
@@ -0,0 +1,145 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(lj/charmmfsw/coul/long/kk,PairLJCharmmfswCoulLongKokkos<LMPDeviceType>);
+PairStyle(lj/charmmfsw/coul/long/kk/device,PairLJCharmmfswCoulLongKokkos<LMPDeviceType>);
+PairStyle(lj/charmmfsw/coul/long/kk/host,PairLJCharmmfswCoulLongKokkos<LMPHostType>);
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_PAIR_LJ_CHARMMFSW_COUL_LONG_KOKKOS_H
+#define LMP_PAIR_LJ_CHARMMFSW_COUL_LONG_KOKKOS_H
+
+#include "pair_kokkos.h"
+#include "pair_lj_charmmfsw_coul_long.h"
+#include "neigh_list_kokkos.h"
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class PairLJCharmmfswCoulLongKokkos : public PairLJCharmmfswCoulLong {  // subclass of the plain pair style, templated on DeviceType
+ public:
+  enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF};  // bitmask combining FULL, HALFTHREAD and HALF
+  enum {COUL_FLAG=1};
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;  // shorthand used for the view types declared below
+  PairLJCharmmfswCoulLongKokkos(class LAMMPS *);
+  ~PairLJCharmmfswCoulLongKokkos() override;
+
+  void compute(int, int) override;
+
+  void init_tables(double cut_coul, double *cut_respa) override;
+  void init_style() override;
+  double init_one(int, int) override;
+
+ protected:  // per-pair kernels (declared here, defined elsewhere) followed by cached state
+  template<bool STACKPARAMS, class Specialisation>
+  KOKKOS_INLINE_FUNCTION
+  F_FLOAT compute_fpair(const F_FLOAT& rsq, const int& i, const int&j,
+                        const int& itype, const int& jtype) const;
+
+  template<bool STACKPARAMS, class Specialisation>
+  KOKKOS_INLINE_FUNCTION
+  F_FLOAT compute_fcoul(const F_FLOAT& rsq, const int& i, const int&j, const int& itype,
+                        const int& jtype, const F_FLOAT& factor_coul, const F_FLOAT& qtmp) const;
+
+  template<bool STACKPARAMS, class Specialisation>
+  KOKKOS_INLINE_FUNCTION
+  F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j,
+                        const int& itype, const int& jtype) const;
+
+  template<bool STACKPARAMS, class Specialisation>
+  KOKKOS_INLINE_FUNCTION
+  F_FLOAT compute_ecoul(const F_FLOAT& rsq, const int& i, const int&j,
+                        const int& itype, const int& jtype, const F_FLOAT& factor_coul, const F_FLOAT& qtmp) const;
+
+  Kokkos::DualView<params_lj_coul**,Kokkos::LayoutRight,DeviceType> k_params;  // per-type-pair parameters; host side is filled in init_one()
+  typename Kokkos::DualView<params_lj_coul**,
+    Kokkos::LayoutRight,DeviceType>::t_dev_const_um params;
+  // fixed-size copies of the per-type-pair data, for type indices up to MAX_TYPES_STACKPARAMS
+  params_lj_coul m_params[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+
+  F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+  F_FLOAT m_cut_ljsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+  F_FLOAT m_cut_coulsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+  typename AT::t_x_array_randomread x;
+  typename AT::t_x_array c_x;
+  typename AT::t_f_array f;
+  typename AT::t_int_1d_randomread type;
+  typename AT::t_float_1d_randomread q;
+
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
+
+  int newton_pair;
+
+  typename AT::tdual_ffloat_2d k_cutsq;  // squared cutoff per type pair; host side is filled in init_one()
+  typename AT::t_ffloat_2d d_cutsq;
+  typename AT::t_ffloat_2d d_cut_ljsq;
+  typename AT::t_ffloat_2d d_cut_coulsq;
+
+  typename AT::t_ffloat_1d_randomread
+    d_rtable, d_drtable, d_ftable, d_dftable,
+    d_ctable, d_dctable, d_etable, d_detable;  // NOTE(review): presumably the lookup tables set up by init_tables() - confirm
+
+  int neighflag;
+  int nlocal,nall,eflag,vflag;
+
+  double special_coul[4];
+  double special_lj[4];
+  double qqrd2e;
+
+  void allocate() override;
+
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,FULL,true,0,CoulLongTable<1>>;  // friends: templated compute helpers need the protected members
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,FULL,true,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,HALF,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,FULL,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,FULL,false,1,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,HALF,false,0,CoulLongTable<1>>;
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<1>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmfswCoulLongKokkos,FULL,0,CoulLongTable<1>>(PairLJCharmmfswCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmfswCoulLongKokkos,FULL,1,CoulLongTable<1>>(PairLJCharmmfswCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmfswCoulLongKokkos,HALF,0,CoulLongTable<1>>(PairLJCharmmfswCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmfswCoulLongKokkos,HALFTHREAD,0,CoulLongTable<1>>(PairLJCharmmfswCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJCharmmfswCoulLongKokkos,CoulLongTable<1>>(PairLJCharmmfswCoulLongKokkos*,
+                                                            NeighListKokkos<DeviceType>*);
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,FULL,true,0,CoulLongTable<0>>;  // same set of friends for CoulLongTable<0>
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,FULL,true,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,HALF,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,HALFTHREAD,true,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,FULL,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,FULL,false,1,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,HALF,false,0,CoulLongTable<0>>;
+  friend struct PairComputeFunctor<PairLJCharmmfswCoulLongKokkos,HALFTHREAD,false,0,CoulLongTable<0>>;
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmfswCoulLongKokkos,FULL,0,CoulLongTable<0>>(PairLJCharmmfswCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmfswCoulLongKokkos,FULL,1,CoulLongTable<0>>(PairLJCharmmfswCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmfswCoulLongKokkos,HALF,0,CoulLongTable<0>>(PairLJCharmmfswCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute_neighlist<PairLJCharmmfswCoulLongKokkos,HALFTHREAD,0,CoulLongTable<0>>(PairLJCharmmfswCoulLongKokkos*,NeighListKokkos<DeviceType>*);
+  friend EV_FLOAT pair_compute<PairLJCharmmfswCoulLongKokkos,CoulLongTable<0>>(PairLJCharmmfswCoulLongKokkos*,
+                                                            NeighListKokkos<DeviceType>*);
+  friend void pair_virial_fdotr_compute<PairLJCharmmfswCoulLongKokkos>(PairLJCharmmfswCoulLongKokkos*);
+
+};
+
+}
+
+#endif
+#endif
+
diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
index 0980ad776d..ef747ef95c 100644
--- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
@@ -106,7 +106,8 @@ void PairPACEExtrapolationKokkos<DeviceType>::grow(int natom, int maxneigh)
 
   if ((int)A.extent(0) < natom) {
 
-    MemKK::realloc_kokkos(A, "pace:A", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(A_sph, "pace:A_sph", natom, nelements, idx_sph_max, nradmax + 1);
+    MemKK::realloc_kokkos(A, "pace:A", natom, nelements, (lmax + 1) * (lmax + 1), nradmax + 1);
     MemKK::realloc_kokkos(A_rank1, "pace:A_rank1", natom, nelements, nradbase);
 
     MemKK::realloc_kokkos(A_list, "pace:A_list", natom, idx_ms_combs_max, basis_set->rankmax);
@@ -117,7 +118,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::grow(int natom, int maxneigh)
     MemKK::realloc_kokkos(rhos, "pace:rhos", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion
     MemKK::realloc_kokkos(dF_drho, "pace:dF_drho", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion
 
-    MemKK::realloc_kokkos(weights, "pace:weights", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(weights, "pace:weights", natom, nelements, idx_sph_max, nradmax + 1);
     MemKK::realloc_kokkos(weights_rank1, "pace:weights_rank1", natom, nelements, nradbase);
 
     // hard-core repulsion
@@ -130,16 +131,16 @@ void PairPACEExtrapolationKokkos<DeviceType>::grow(int natom, int maxneigh)
 
     MemKK::realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_ms_combs_max, basis_set->rankmax);
 
-    //B-projections
+    // B-projections
     MemKK::realloc_kokkos(projections, "pace:projections", natom, total_num_functions_max); // per-atom B-projections
     MemKK::realloc_kokkos(d_gamma, "pace:gamma", natom); // per-atom gamma
   }
 
-  if (((int)ylm.extent(0) < natom) || ((int)ylm.extent(1) < maxneigh)) {
+  if (((int)fr.extent(0) < natom) || ((int)fr.extent(1) < maxneigh)) {
 
     // radial functions
-    MemKK::realloc_kokkos(fr, "pace:fr", natom, maxneigh, nradmax, lmax + 1);
-    MemKK::realloc_kokkos(dfr, "pace:dfr", natom, maxneigh, nradmax, lmax + 1);
+    MemKK::realloc_kokkos(fr, "pace:fr", natom, maxneigh, lmax + 1, nradmax);
+    MemKK::realloc_kokkos(dfr, "pace:dfr", natom, maxneigh, lmax + 1, nradmax);
     MemKK::realloc_kokkos(gr, "pace:gr", natom, maxneigh, nradbase);
     MemKK::realloc_kokkos(dgr, "pace:dgr", natom, maxneigh, nradbase);
     const int max_num_functions = MAX(nradbase, nradmax*(lmax + 1));
@@ -150,12 +151,6 @@ void PairPACEExtrapolationKokkos<DeviceType>::grow(int natom, int maxneigh)
     MemKK::realloc_kokkos(cr, "pace:cr", natom, maxneigh);
     MemKK::realloc_kokkos(dcr, "pace:dcr", natom, maxneigh);
 
-    // spherical harmonics
-    MemKK::realloc_kokkos(plm, "pace:plm", natom, maxneigh, (lmax + 1) * (lmax + 1));
-    MemKK::realloc_kokkos(dplm, "pace:dplm", natom, maxneigh, (lmax + 1) * (lmax + 1));
-    MemKK::realloc_kokkos(ylm, "pace:ylm", natom, maxneigh, (lmax + 1) * (lmax + 1));
-    MemKK::realloc_kokkos(dylm, "pace:dylm", natom, maxneigh, (lmax + 1) * (lmax + 1));
-
     // short neigh list
     MemKK::realloc_kokkos(d_ncount, "pace:ncount", natom);
     MemKK::realloc_kokkos(d_mu, "pace:mu", natom, maxneigh);
@@ -224,7 +219,6 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_pertype()
   Kokkos::deep_copy(d_wpre, h_wpre);
   Kokkos::deep_copy(d_mexp, h_mexp);
 
-
   // ZBL core-rep
   MemKK::realloc_kokkos(d_cut_in, "pace:d_cut_in", nelements, nelements);
   MemKK::realloc_kokkos(d_dcut_in, "pace:d_dcut_in", nelements, nelements);
@@ -266,6 +260,9 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_splines()
 
   ACERadialFunctions* radial_functions = dynamic_cast<ACERadialFunctions*>(basis_set->radial_functions);
 
+  if (radial_functions == nullptr)
+    error->all(FLERR,"Chosen radial basis style not supported by pair style pace/kk");
+
   for (int i = 0; i < nelements; i++) {
     for (int j = 0; j < nelements; j++) {
       k_splines_gk.h_view(i, j) = radial_functions->splines_gk(i, j);
@@ -297,8 +294,9 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
   total_num_functions_max = 0;
 
   MemKK::realloc_kokkos(d_idx_ms_combs_count, "pace:idx_ms_combs_count", nelements);
-  MemKK::realloc_kokkos(d_total_basis_size, "pace:total_basis_size", nelements);
   auto h_idx_ms_combs_count = Kokkos::create_mirror_view(d_idx_ms_combs_count);
+
+  MemKK::realloc_kokkos(d_total_basis_size, "pace:total_basis_size", nelements);
   auto h_total_basis_size = Kokkos::create_mirror_view(d_total_basis_size);
 
   for (int mu = 0; mu < nelements; mu++) {
@@ -313,8 +311,8 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
       idx_ms_combs++;
 
     // rank > 1
-    for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) {
-      ACEBBasisFunction *func = &basis[func_ind];
+    for (int idx_func = 0; idx_func < total_basis_size; ++idx_func) {
+      ACEBBasisFunction *func = &basis[idx_func];
 
       // loop over {ms} combinations in sum
       for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind)
@@ -331,7 +329,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
 
   MemKK::realloc_kokkos(d_rank, "pace:rank", nelements, total_num_functions_max);
   MemKK::realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_num_functions_max);
-  MemKK::realloc_kokkos(d_func_inds, "pace:func_inds", nelements, idx_ms_combs_max);
+  MemKK::realloc_kokkos(d_idx_funcs, "pace:idx_funcs", nelements, idx_ms_combs_max);
   MemKK::realloc_kokkos(d_mus, "pace:mus", nelements, total_num_functions_max, basis_set->rankmax);
   MemKK::realloc_kokkos(d_ns, "pace:ns", nelements, total_num_functions_max, basis_set->rankmax);
   MemKK::realloc_kokkos(d_ls, "pace:ls", nelements, total_num_functions_max, basis_set->rankmax);
@@ -344,7 +342,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
 
   auto h_rank = Kokkos::create_mirror_view(d_rank);
   auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs);
-  auto h_func_inds = Kokkos::create_mirror_view(d_func_inds);
+  auto h_idx_funcs = Kokkos::create_mirror_view(d_idx_funcs);
   auto h_mus = Kokkos::create_mirror_view(d_mus);
   auto h_ns = Kokkos::create_mirror_view(d_ns);
   auto h_ls = Kokkos::create_mirror_view(d_ls);
@@ -365,55 +363,52 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
 
     const int ndensity = basis_set->map_embedding_specifications.at(mu).ndensity;
 
-    int idx_ms_comb = 0;
+    int idx_ms_combs = 0;
 
     // rank=1
-    for (int func_ind = 0; func_ind < total_basis_size_rank1; ++func_ind) {
-      ACEBBasisFunction *func = &basis_rank1[func_ind];
-      h_rank(mu, func_ind) = 1;
-      h_mus(mu, func_ind, 0) = func->mus[0];
-      h_ns(mu, func_ind, 0) = func->ns[0];
+    for (int idx_func = 0; idx_func < total_basis_size_rank1; ++idx_func) {
+      ACEBBasisFunction *func = &basis_rank1[idx_func];
+      h_rank(mu, idx_func) = 1;
+      h_mus(mu, idx_func, 0) = func->mus[0];
+      h_ns(mu, idx_func, 0) = func->ns[0];
 
       for (int p = 0; p < ndensity; ++p)
-            h_coeffs(mu, func_ind, p) = func->coeff[p];
+        h_coeffs(mu, idx_func, p) = func->coeff[p];
 
-      h_gen_cgs(mu, idx_ms_comb) = func->gen_cgs[0];
+      h_gen_cgs(mu, idx_ms_combs) = func->gen_cgs[0];
 
-      h_func_inds(mu, idx_ms_comb) = func_ind;
-      idx_ms_comb++;
+      h_idx_funcs(mu, idx_ms_combs) = idx_func;
+      idx_ms_combs++;
     }
 
     // rank > 1
-    for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) {
-      ACEBBasisFunction *func = &basis[func_ind];
+    for (int idx_func = 0; idx_func < total_basis_size; ++idx_func) {
+      ACEBBasisFunction *func = &basis[idx_func];
       // TODO: check if func->ctildes are zero, then skip
 
-      const int func_ind_through = total_basis_size_rank1 + func_ind;
+      const int idx_func_through = total_basis_size_rank1 + idx_func;
 
-      const int rank = h_rank(mu, func_ind_through) = func->rank;
-      h_num_ms_combs(mu, func_ind_through) = func->num_ms_combs;
+      const int rank = h_rank(mu, idx_func_through) = func->rank;
+      h_num_ms_combs(mu, idx_func_through) = func->num_ms_combs;
       for (int t = 0; t < rank; t++) {
-        h_mus(mu, func_ind_through, t) = func->mus[t];
-        h_ns(mu, func_ind_through, t) = func->ns[t];
-        h_ls(mu, func_ind_through, t) = func->ls[t];
+        h_mus(mu, idx_func_through, t) = func->mus[t];
+        h_ns(mu, idx_func_through, t) = func->ns[t];
+        h_ls(mu, idx_func_through, t) = func->ls[t];
       }
 
       for (int p = 0; p < ndensity; ++p)
-        h_coeffs(mu, func_ind_through, p) = func->coeff[p];
-
+        h_coeffs(mu, idx_func_through, p) = func->coeff[p];
 
       // loop over {ms} combinations in sum
       for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind) {
         auto ms = &func->ms_combs[ms_ind * rank]; // current ms-combination (of length = rank)
         for (int t = 0; t < rank; t++)
-          h_ms_combs(mu, idx_ms_comb, t) = ms[t];
+          h_ms_combs(mu, idx_ms_combs, t) = ms[t];
 
+        h_gen_cgs(mu, idx_ms_combs) = func->gen_cgs[ms_ind];
 
-        h_gen_cgs(mu, idx_ms_comb) = func->gen_cgs[ms_ind];
-
-
-        h_func_inds(mu, idx_ms_comb) = func_ind_through;
-        idx_ms_comb++;
+        h_idx_funcs(mu, idx_ms_combs) = idx_func_through;
+        idx_ms_combs++;
       }
     }
 
@@ -427,7 +422,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
 
   Kokkos::deep_copy(d_rank, h_rank);
   Kokkos::deep_copy(d_num_ms_combs, h_num_ms_combs);
-  Kokkos::deep_copy(d_func_inds, h_func_inds);
+  Kokkos::deep_copy(d_idx_funcs, h_idx_funcs);
   Kokkos::deep_copy(d_mus, h_mus);
   Kokkos::deep_copy(d_ns, h_ns);
   Kokkos::deep_copy(d_ls, h_ls);
@@ -477,6 +472,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::init_style()
 
   // spherical harmonics
 
+  MemKK::realloc_kokkos(d_idx_sph, "pace:idx_sph", (lmax + 1) * (lmax + 1));
   MemKK::realloc_kokkos(alm, "pace:alm", (lmax + 1) * (lmax + 1));
   MemKK::realloc_kokkos(blm, "pace:blm", (lmax + 1) * (lmax + 1));
   MemKK::realloc_kokkos(cl, "pace:cl", lmax + 1);
@@ -575,6 +571,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
     atomKK->modified(Host,F_MASK);
     return;
   }
+
   eflag = eflag_in;
   vflag = vflag_in;
 
@@ -602,6 +599,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
         //zeroify array
         memset(extrapolation_grade_gamma, 0, nmax * sizeof(*extrapolation_grade_gamma));
   }
+
   if (flag_corerep_factor && atom->nlocal > nmax_corerep) {
     memory->destroy(corerep_factor);
     nmax_corerep = atom->nlocal;
@@ -647,7 +645,6 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
   chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user
   chunk_offset = 0;
 
-
   grow(chunk_size, maxneigh);
 
   EV_FLOAT ev;
@@ -656,14 +653,12 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
 
     Kokkos::deep_copy(weights, 0.0);
     Kokkos::deep_copy(weights_rank1, 0.0);
-    Kokkos::deep_copy(A, 0.0);
+    Kokkos::deep_copy(A_sph, 0.0);
     Kokkos::deep_copy(A_rank1, 0.0);
     Kokkos::deep_copy(rhos, 0.0);
-
     Kokkos::deep_copy(rho_core, 0.0);
     Kokkos::deep_copy(d_d_min, PairPACEExtrapolation::aceimpl->basis_set->cutoffmax);
     Kokkos::deep_copy(d_jj_min, -1);
-
     Kokkos::deep_copy(projections, 0.0);
     Kokkos::deep_copy(d_gamma, 0.0);
     Kokkos::deep_copy(d_corerep, 0.0);
@@ -693,15 +688,6 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
       Kokkos::parallel_for("ComputeRadial",policy_radial,*this);
     }
 
-    //ComputeYlm
-    {
-      int vector_length = vector_length_default;
-      int team_size = 16;
-      check_team_size_for<TagPairPACEComputeYlm>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
-      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm> policy_ylm(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
-      Kokkos::parallel_for("ComputeYlm",policy_ylm,*this);
-    }
-
     //ComputeAi
     {
       int vector_length = vector_length_default;
@@ -737,7 +723,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
 
     //ComputeWeights
     {
-      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeWeights> policy_weights(0, chunk_size * idx_ms_combs_max);
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeWeights> policy_weights(0,chunk_size * idx_ms_combs_max);
       Kokkos::parallel_for("ComputeWeights",policy_weights,*this);
     }
 
@@ -746,7 +732,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
       int vector_length = vector_length_default;
       int team_size = team_size_default;
       check_team_size_for<TagPairPACEComputeDerivative>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
-      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeDerivative> policy_derivative(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType,TagPairPACEComputeDerivative> policy_derivative(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
       Kokkos::parallel_for("ComputeDerivative",policy_derivative,*this);
     }
 
@@ -772,16 +758,18 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
     }
     ev += ev_tmp;
 
-    //if flag_compute_extrapolation_grade - copy current d_gamma to extrapolation_grade_gamma
+    // if flag_compute_extrapolation_grade - copy current d_gamma to extrapolation_grade_gamma
+
     if (flag_compute_extrapolation_grade){
         h_gamma = Kokkos::create_mirror_view(d_gamma);
         Kokkos::deep_copy(h_gamma, d_gamma);
         memcpy(extrapolation_grade_gamma+chunk_offset, (void *) h_gamma.data(), sizeof(double)*chunk_size);
     }
+
     if (flag_corerep_factor) {
-        h_corerep = Kokkos::create_mirror_view(d_corerep);
-        Kokkos::deep_copy(h_corerep,d_corerep);
-        memcpy(corerep_factor+chunk_offset, (void *) h_corerep.data(), sizeof(double)*chunk_size);
+      h_corerep = Kokkos::create_mirror_view(d_corerep);
+      Kokkos::deep_copy(h_corerep,d_corerep);
+      memcpy(corerep_factor+chunk_offset, (void *) h_corerep.data(), sizeof(double)*chunk_size);
     }
 
     chunk_offset += chunk_size;
@@ -909,18 +897,17 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeNeig
       Kokkos::MinLoc<F_FLOAT,int> reducer_scalar(djjmin);
       // loop over ncount (actual neighbours withing cutoff) rather than jnum (total number of neigh in cutoff+skin)
       Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, ncount),
-                              [&](const int offset, minloc_value_type &min_d_dist) {
-                                int j = d_nearest(ii,offset);
-                                j &= NEIGHMASK;
-                                const int jtype = type(j);
-                                auto r = d_rnorms(ii,offset);
-                                const int mu_j = d_map(type(j));
-                                const F_FLOAT d = r - (d_cut_in(mu_i, mu_j) - d_dcut_in(mu_i, mu_j));
-                                if (d < min_d_dist.val) {
-                                  min_d_dist.val = d;
-                                  min_d_dist.loc = offset;
-                                }
-                              }, reducer_scalar);
+               [&](const int offset, minloc_value_type &min_d_dist) {
+                 int j = d_nearest(ii,offset);
+                 j &= NEIGHMASK;
+                 auto r = d_rnorms(ii,offset);
+                 const int mu_j = d_map(type(j));
+                 const F_FLOAT d = r - (d_cut_in(mu_i, mu_j) - d_dcut_in(mu_i, mu_j));
+                 if (d < min_d_dist.val) {
+                   min_d_dist.val = d;
+                   min_d_dist.loc = offset;
+                 }
+       }, reducer_scalar);
       d_d_min(ii) = djjmin.val;
       d_jj_min(ii) = djjmin.loc;// d_jj_min should be NOT in 0..jnum range, but in 0..d_ncount(<=jnum)
     } else {
@@ -956,28 +943,6 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeRadi
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeYlm, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm>::member_type& team) const
-{
-  // Extract the atom number
-  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
-           ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
-
-  // Extract the neighbor number
-  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
-  const int ncount = d_ncount(ii);
-  if (jj >= ncount) return;
-
-  const double xn = d_rhats(ii, jj, 0);
-  const double yn = d_rhats(ii, jj, 1);
-  const double zn = d_rhats(ii, jj, 2);
-  compute_ylm(ii,jj,xn,yn,zn,lmax);
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeAi, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeAi>::member_type& team) const
@@ -999,13 +964,127 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeAi,
     Kokkos::atomic_add(&A_rank1(ii, mu_j, n), gr(ii, jj, n) * Y00);
 
   // rank > 1
-  for (int n = 0; n < nradmax; n++) {
-    for (int l = 0; l <= lmax; l++) {
-      for (int m = 0; m <= l; m++) {
-        const int idx = l * (l + 1) + m; // (l, m)
-        Kokkos::atomic_add(&A(ii, mu_j, n, idx).re, fr(ii, jj, n, l) * ylm(ii, jj, idx).re);
-        Kokkos::atomic_add(&A(ii, mu_j, n, idx).im, fr(ii, jj, n, l) * ylm(ii, jj, idx).im);
+
+  // Compute plm and ylm
+
+  // requires rx^2 + ry^2 + rz^2 = 1 , NO CHECKING IS PERFORMED !!!!!!!!!
+  // requires -1 <= rz <= 1 , NO CHECKING IS PERFORMED !!!!!!!!!
+  // prefactors include 1/sqrt(2) factor compared to reference
+
+  complex ylm, phase;
+  complex phasem, mphasem1;
+  complex dyx, dyy, dyz;
+  complex rdy;
+
+  const double rx = d_rhats(ii, jj, 0);
+  const double ry = d_rhats(ii, jj, 1);
+  const double rz = d_rhats(ii, jj, 2);
+
+  phase.re = rx;
+  phase.im = ry;
+
+  double plm_idx,plm_idx1,plm_idx2;
+
+  plm_idx = plm_idx1 = plm_idx2 = 0.0;
+
+  int idx_sph = 0;
+
+  // m = 0
+  for (int l = 0; l <= lmax; l++) {
+    // const int idx = l * (l + 1);
+
+    if (l == 0) {
+      // l=0, m=0
+      // plm[0] = Y00/sq1o4pi; //= sq1o4pi;
+      plm_idx = Y00; //= 1;
+    } else if (l == 1) {
+      // l=1, m=0
+      plm_idx = Y00 * sq3 * rz;
+    } else {
+      // l>=2, m=0
+      plm_idx = alm(idx_sph) * (rz * plm_idx1 + blm(idx_sph) * plm_idx2);
+    }
+
+    ylm.re = plm_idx;
+    ylm.im = 0.0;
+
+    for (int n = 0; n < nradmax; n++) {
+      Kokkos::atomic_add(&A_sph(ii, mu_j, idx_sph, n).re, fr(ii, jj, l, n) * ylm.re);
+      Kokkos::atomic_add(&A_sph(ii, mu_j, idx_sph, n).im, fr(ii, jj, l, n) * ylm.im);
+    }
+
+    plm_idx2 = plm_idx1;
+    plm_idx1 = plm_idx;
+
+    idx_sph++;
+  }
+
+  plm_idx = plm_idx1 = plm_idx2 = 0.0;
+
+  // m = 1
+  for (int l = 1; l <= lmax; l++) {
+    // const int idx = l * (l + 1) + 1; // (l, 1)
+
+    if (l == 1) {
+      // l=1, m=1
+      plm_idx = -sq3o2 * Y00;
+    } else if (l == 2) {
+      const double t = dl(l) * plm_idx1;
+      plm_idx = t * rz;
+    } else {
+      plm_idx = alm(idx_sph) * (rz * plm_idx1 + blm(idx_sph) * plm_idx2);
+    }
+
+    ylm = phase * plm_idx;
+
+    for (int n = 0; n < nradmax; n++) {
+      Kokkos::atomic_add(&A_sph(ii, mu_j, idx_sph, n).re, fr(ii, jj, l, n) * ylm.re);
+      Kokkos::atomic_add(&A_sph(ii, mu_j, idx_sph, n).im, fr(ii, jj, l, n) * ylm.im);
+    }
+
+    plm_idx2 = plm_idx1;
+    plm_idx1 = plm_idx;
+
+    idx_sph++;
+  }
+
+  plm_idx = plm_idx1 = plm_idx2 = 0.0;
+
+  double plm_mm1_mm1 = -sq3o2 * Y00; // (1, 1)
+
+  // m > 1
+  phasem = phase;
+  for (int m = 2; m <= lmax; m++) {
+
+    mphasem1.re = phasem.re * double(m);
+    mphasem1.im = phasem.im * double(m);
+    phasem = phasem * phase;
+
+    for (int l = m; l <= lmax; l++) {
+      // const int idx = l * (l + 1) + m;
+
+      if (l == m) {
+        plm_idx = cl(l) * plm_mm1_mm1; // (m+1, m)
+        plm_mm1_mm1 = plm_idx;
+      } else if (l == (m + 1)) {
+        const double t = dl(l) * plm_mm1_mm1; // (m - 1, m - 1)
+        plm_idx = t * rz; // (m, m)
+      } else {
+        plm_idx = alm(idx_sph) * (rz * plm_idx1 + blm(idx_sph) * plm_idx2);
       }
+
+      ylm.re = phasem.re * plm_idx;
+      ylm.im = phasem.im * plm_idx;
+
+      for (int n = 0; n < nradmax; n++) {
+        Kokkos::atomic_add(&A_sph(ii, mu_j, idx_sph, n).re, fr(ii, jj, l, n) * ylm.re);
+        Kokkos::atomic_add(&A_sph(ii, mu_j, idx_sph, n).im, fr(ii, jj, l, n) * ylm.im);
+      }
+
+      plm_idx2 = plm_idx1;
+      plm_idx1 = plm_idx;
+
+      idx_sph++;
     }
   }
 
@@ -1019,17 +1098,35 @@ template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEConjugateAi, const int& ii) const
 {
-  //complex conjugate A's (for NEGATIVE (-m) terms)
-  // for rank > 1
   for (int mu_j = 0; mu_j < nelements; mu_j++) {
-    for (int n = 0; n < nradmax; n++) {
-      for (int l = 0; l <= lmax; l++) {
+
+    // transpose
+
+    int idx_sph = 0;
+
+    for (int m = 0; m <= lmax; m++) {
+      for (int l = m; l <= lmax; l++) {
+        const int idx = l * (l + 1) + m;
+        for (int n = 0; n < nradmax; n++) {
+          A(ii, mu_j, idx, n) = A_sph(ii, mu_j, idx_sph, n);
+        }
+
+        idx_sph++;
+      }
+    }
+
+    // complex conjugate A's (for NEGATIVE (-m) terms)
+    //  for rank > 1
+
+    for (int l = 0; l <= lmax; l++) {
         //fill in -m part in the outer loop using the same m <-> -m symmetry as for Ylm
-        for (int m = 1; m <= l; m++) {
-          const int idx = l * (l + 1) + m; // (l, m)
-          const int idxm = l * (l + 1) - m; // (l, -m)
-          const int factor = m % 2 == 0 ? 1 : -1;
-          A(ii, mu_j, n, idxm) = A(ii, mu_j, n, idx).conj() * (double)factor;
+      for (int m = 1; m <= l; m++) {
+        const int idx = l * (l + 1) + m; // (l, m)
+        const int idxm = l * (l + 1) - m; // (l, -m)
+        const int idx_sph = d_idx_sph(idx);
+        const int factor = m % 2 == 0 ? 1 : -1;
+        for (int n = 0; n < nradmax; n++) {
+          A(ii, mu_j, idxm, n) = A_sph(ii, mu_j, idx_sph, n).conj() * (double)factor;
         }
       }
     }
@@ -1042,73 +1139,72 @@ template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeRho, const int& iter) const
 {
-  const int idx_ms_comb = iter / chunk_size;
+  const int idx_ms_combs = iter / chunk_size;
   const int ii = iter % chunk_size;
 
   const int i = d_ilist[ii + chunk_offset];
   const int mu_i = d_map(type(i));
 
-  if (idx_ms_comb >= d_idx_ms_combs_count(mu_i)) return;
+  if (idx_ms_combs >= d_idx_ms_combs_count(mu_i)) return;
 
   const int ndensity = d_ndensity(mu_i);
 
-  const int func_ind = d_func_inds(mu_i, idx_ms_comb);
-  const int rank = d_rank(mu_i, func_ind);
+  const int idx_func = d_idx_funcs(mu_i, idx_ms_combs);
+  const int rank = d_rank(mu_i, idx_func);
   const int r = rank - 1;
 
   // Basis functions B with iterative product and density rho(p) calculation
   if (rank == 1) {
-    const int mu = d_mus(mu_i, func_ind, 0);
-    const int n = d_ns(mu_i, func_ind, 0);
+    const int mu = d_mus(mu_i, idx_func, 0);
+    const int n = d_ns(mu_i, idx_func, 0);
     double A_cur = A_rank1(ii, mu, n - 1);
     for (int p = 0; p < ndensity; ++p) {
       //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
-      Kokkos::atomic_add(&rhos(ii, p), d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb) * A_cur);
+      Kokkos::atomic_add(&rhos(ii, p), d_coeffs(mu_i, idx_func, p) * d_gen_cgs(mu_i, idx_ms_combs) * A_cur);
     }
 
-
-    //gamma_i
+    // gamma_i
     if (flag_compute_extrapolation_grade)
-        Kokkos::atomic_add(&projections(ii, func_ind),  d_gen_cgs(mu_i, idx_ms_comb) * A_cur);
+      Kokkos::atomic_add(&projections(ii, idx_func),  d_gen_cgs(mu_i, idx_ms_combs) * A_cur);
 
   } else { // rank > 1
     // loop over {ms} combinations in sum
 
     // loop over m, collect B  = product of A with given ms
-    A_forward_prod(ii, idx_ms_comb, 0) = complex::one();
+    A_forward_prod(ii, idx_ms_combs, 0) = complex::one();
 
     // fill forward A-product triangle
     for (int t = 0; t < rank; t++) {
       //TODO: optimize ns[t]-1 -> ns[t] during functions construction
-      const int mu = d_mus(mu_i, func_ind, t);
-      const int n = d_ns(mu_i, func_ind, t);
-      const int l = d_ls(mu_i, func_ind, t);
-      const int m = d_ms_combs(mu_i, idx_ms_comb, t); // current ms-combination (of length = rank)
+      const int mu = d_mus(mu_i, idx_func, t);
+      const int n = d_ns(mu_i, idx_func, t);
+      const int l = d_ls(mu_i, idx_func, t);
+      const int m = d_ms_combs(mu_i, idx_ms_combs, t); // current ms-combination (of length = rank)
       const int idx = l * (l + 1) + m; // (l, m)
-      A_list(ii, idx_ms_comb, t) = A(ii, mu, n - 1, idx);
-      A_forward_prod(ii, idx_ms_comb, t + 1) = A_forward_prod(ii, idx_ms_comb, t) * A_list(ii, idx_ms_comb, t);
+      A_list(ii, idx_ms_combs, t) = A(ii, mu, idx, n - 1);
+      A_forward_prod(ii, idx_ms_combs, t + 1) = A_forward_prod(ii, idx_ms_combs, t) * A_list(ii, idx_ms_combs, t);
     }
 
     complex A_backward_prod = complex::one();
 
     // fill backward A-product triangle
     for (int t = r; t >= 1; t--) {
-      const complex dB = A_forward_prod(ii, idx_ms_comb, t) * A_backward_prod; // dB - product of all A's except t-th
-      dB_flatten(ii, idx_ms_comb, t) = dB;
+      const complex dB = A_forward_prod(ii, idx_ms_combs, t) * A_backward_prod; // dB - product of all A's except t-th
+      dB_flatten(ii, idx_ms_combs, t) = dB;
 
-      A_backward_prod = A_backward_prod * A_list(ii, idx_ms_comb, t);
+      A_backward_prod = A_backward_prod * A_list(ii, idx_ms_combs, t);
     }
-    dB_flatten(ii, idx_ms_comb, 0) = A_forward_prod(ii, idx_ms_comb, 0) * A_backward_prod;
+    dB_flatten(ii, idx_ms_combs, 0) = A_forward_prod(ii, idx_ms_combs, 0) * A_backward_prod;
 
-    const complex B = A_forward_prod(ii, idx_ms_comb, rank);
+    const complex B = A_forward_prod(ii, idx_ms_combs, rank);
 
     for (int p = 0; p < ndensity; ++p) {
       // real-part only multiplication
-      Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb)));
+      Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_coeffs(mu_i, idx_func, p) * d_gen_cgs(mu_i, idx_ms_combs)));
     }
-    //gamma_i
+    // gamma_i
     if (flag_compute_extrapolation_grade)
-      Kokkos::atomic_add(&projections(ii, func_ind),  B.real_part_product(d_gen_cgs(mu_i, idx_ms_comb)));
+      Kokkos::atomic_add(&projections(ii, idx_func),  B.real_part_product(d_gen_cgs(mu_i, idx_ms_combs)));
   }
 }
 
@@ -1129,7 +1225,6 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeFS,
   double evdwl_cut;
   evdwl = fcut = dfcut = 0.0;
 
-  inner_cutoff(rho_core(ii), rho_cut, drho_cut, fcut, dfcut);
   FS_values_and_derivatives(ii, evdwl, mu_i);
 
   if (is_zbl) {
@@ -1155,7 +1250,6 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeFS,
   for (int p = 0; p < ndensity; ++p)
     dF_drho(ii, p) *= fcut;
 
-
   // tally energy contribution
   if (eflag) {
     // E0 shift
@@ -1201,52 +1295,58 @@ template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeWeights, const int& iter) const
 {
-  const int idx_ms_comb = iter / chunk_size;
+  const int idx_ms_combs = iter / chunk_size;
   const int ii = iter % chunk_size;
 
   const int i = d_ilist[ii + chunk_offset];
   const int mu_i = d_map(type(i));
 
-  if (idx_ms_comb >= d_idx_ms_combs_count(mu_i)) return;
+  if (idx_ms_combs >= d_idx_ms_combs_count(mu_i)) return;
 
   const int ndensity = d_ndensity(mu_i);
 
-  const int func_ind = d_func_inds(mu_i, idx_ms_comb);
-  const int rank = d_rank(mu_i, func_ind);
+  const int idx_func = d_idx_funcs(mu_i, idx_ms_combs);
+  const int rank = d_rank(mu_i, idx_func);
 
   // Weights and theta calculation
 
   if (rank == 1) {
-    const int mu = d_mus(mu_i, func_ind, 0);
-    const int n = d_ns(mu_i, func_ind, 0);
+    const int mu = d_mus(mu_i, idx_func, 0);
+    const int n = d_ns(mu_i, idx_func, 0);
     double theta = 0.0;
     for (int p = 0; p < ndensity; ++p) {
       // for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
-      theta += dF_drho(ii, p) * d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb);
+      theta += dF_drho(ii, p) * d_coeffs(mu_i, idx_func, p) * d_gen_cgs(mu_i, idx_ms_combs);
     }
     Kokkos::atomic_add(&weights_rank1(ii, mu, n - 1), theta);
   } else { // rank > 1
     double theta = 0.0;
     for (int p = 0; p < ndensity; ++p)
-      theta += dF_drho(ii, p) * d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb);
+      theta += dF_drho(ii, p) * d_coeffs(mu_i, idx_func, p) * d_gen_cgs(mu_i, idx_ms_combs);
 
     theta *= 0.5; // 0.5 factor due to possible double counting ???
     for (int t = 0; t < rank; ++t) {
-      const int m_t = d_ms_combs(mu_i, idx_ms_comb, t);
+      const int m_t = d_ms_combs(mu_i, idx_ms_combs, t);
       const int factor = (m_t % 2 == 0 ? 1 : -1);
-      const complex dB = dB_flatten(ii, idx_ms_comb, t);
-      const int mu_t = d_mus(mu_i, func_ind, t);
-      const int n_t = d_ns(mu_i, func_ind, t);
-      const int l_t = d_ls(mu_i, func_ind, t);
+      const complex dB = dB_flatten(ii, idx_ms_combs, t);
+      const int mu_t = d_mus(mu_i, idx_func, t);
+      const int n_t = d_ns(mu_i, idx_func, t);
+      const int l_t = d_ls(mu_i, idx_func, t);
       const int idx = l_t * (l_t + 1) + m_t; // (l, m)
-      const complex value = theta * dB;
-      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).re), value.re);
-      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).im), value.im);
+      const int idx_sph = d_idx_sph(idx); // compact slot of (l, m_t), or -1 (pre_compute_harmonics assigns slots only for m >= 0)
+      if (idx_sph >= 0) { // skip (l, m) pairs that have no slot in weights
+        const complex value = theta * dB;
+        Kokkos::atomic_add(&(weights(ii, mu_t, idx_sph, n_t - 1).re), value.re);
+        Kokkos::atomic_add(&(weights(ii, mu_t, idx_sph, n_t - 1).im), value.im);
+      }
       // update -m_t (that could also be positive), because the basis is half_basis
       const int idxm = l_t * (l_t + 1) - m_t; // (l, -m)
-      const complex valuem = theta * dB.conj() * (double)factor;
-      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).re), valuem.re);
-      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).im), valuem.im);
+      const int idxm_sph = d_idx_sph(idxm);
+      if (idxm_sph >= 0) {
+        const complex valuem = theta * dB.conj() * (double)factor;
+        Kokkos::atomic_add(&(weights(ii, mu_t, idxm_sph, n_t - 1).re), valuem.re);
+        Kokkos::atomic_add(&(weights(ii, mu_t, idxm_sph, n_t - 1).im), valuem.im);
+      }
     }
   }
 }
@@ -1293,37 +1393,239 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeDeri
   }
 
   // for rank > 1
-  for (int n = 0; n < nradmax; n++) {
-    for (int l = 0; l <= lmax; l++) {
-      const double R_over_r = fr(ii, jj, n, l) * rinv;
-      const double DR = dfr(ii, jj, n, l);
 
-      // for m >= 0
-      for (int m = 0; m <= l; m++) {
-        const int idx = l * (l + 1) + m; // (l, m)
-        complex w = weights(ii, mu_j, n, idx);
+  // compute plm, dplm, ylm and dylm
+  // requires rx^2 + ry^2 + rz^2 = 1 , NO CHECKING IS PERFORMED !!!!!!!!!
+  // requires -1 <= rz <= 1 , NO CHECKING IS PERFORMED !!!!!!!!!
+  // prefactors include 1/sqrt(2) factor compared to reference
+
+  complex ylm,dylm[3];
+  complex phase;
+  complex phasem, mphasem1;
+  complex dyx, dyy, dyz;
+  complex rdy;
+
+  const double rx = d_rhats(ii, jj, 0);
+  const double ry = d_rhats(ii, jj, 1);
+  const double rz = d_rhats(ii, jj, 2);
+
+  phase.re = rx;
+  phase.im = ry;
+
+  double plm_idx,plm_idx1,plm_idx2;
+  double dplm_idx,dplm_idx1,dplm_idx2;
+
+  plm_idx = plm_idx1 = plm_idx2 = 0.0;
+  dplm_idx = dplm_idx1 = dplm_idx2 = 0.0;
+
+  int idx_sph = 0;
+
+  // m = 0
+  for (int l = 0; l <= lmax; l++) {
+    // const int idx = l * (l + 1);
+
+    if (l == 0) {
+      // l=0, m=0
+      // plm[0] = Y00/sq1o4pi; //= sq1o4pi;
+      plm_idx = Y00; //= 1;
+      dplm_idx = 0.0;
+    } else if (l == 1) {
+      // l=1, m=0
+      plm_idx = Y00 * sq3 * rz;
+      dplm_idx = Y00 * sq3;
+    } else {
+      // l>=2, m=0
+      plm_idx = alm(idx_sph) * (rz * plm_idx1 + blm(idx_sph) * plm_idx2);
+      dplm_idx = alm(idx_sph) * (plm_idx1 + rz * dplm_idx1 + blm(idx_sph) * dplm_idx2);
+    }
+
+    ylm.re = plm_idx;
+    ylm.im = 0.0;
+
+    dyz.re = dplm_idx;
+    rdy.re = dyz.re * rz;
+
+    dylm[0].re = -rdy.re * rx;
+    dylm[0].im = 0.0;
+    dylm[1].re = -rdy.re * ry;
+    dylm[1].im = 0.0;
+    dylm[2].re = dyz.re - rdy.re * rz;
+    dylm[2].im = 0;
+
+    for (int n = 0; n < nradmax; n++) {
+
+      const double R_over_r = fr(ii, jj, l, n) * rinv;
+      const double DR = dfr(ii, jj, l, n);
+      const complex Y_DR = ylm * DR;
+
+      complex w = weights(ii, mu_j, idx_sph, n);
+      if (w.re == 0.0 && w.im == 0.0) continue;
+
+      complex grad_phi_nlm[3];
+      grad_phi_nlm[0] = Y_DR * r_hat[0] + dylm[0] * R_over_r;
+      grad_phi_nlm[1] = Y_DR * r_hat[1] + dylm[1] * R_over_r;
+      grad_phi_nlm[2] = Y_DR * r_hat[2] + dylm[2] * R_over_r;
+      // real-part multiplication only
+      f_ji[0] += w.real_part_product(grad_phi_nlm[0]);
+      f_ji[1] += w.real_part_product(grad_phi_nlm[1]);
+      f_ji[2] += w.real_part_product(grad_phi_nlm[2]);
+    }
+
+    plm_idx2 = plm_idx1;
+    dplm_idx2 = dplm_idx1;
+
+    plm_idx1 = plm_idx;
+    dplm_idx1 = dplm_idx;
+
+    idx_sph++;
+  }
+
+  plm_idx = plm_idx1 = plm_idx2 = 0.0;
+  dplm_idx = dplm_idx1 = dplm_idx2 = 0.0;
+
+  // m = 1
+  for (int l = 1; l <= lmax; l++) {
+    // const int idx = l * (l + 1) + 1; // (l, 1)
+
+    if (l == 1) {
+      // l=1, m=1
+      plm_idx = -sq3o2 * Y00;
+      dplm_idx = 0.0;
+    } else if (l == 2) {
+      const double t = dl(l) * plm_idx1;
+      plm_idx = t * rz;
+      dplm_idx = t;
+    } else {
+      plm_idx = alm(idx_sph) * (rz * plm_idx1 + blm(idx_sph) * plm_idx2);
+      dplm_idx = alm(idx_sph) * (plm_idx1 + rz * dplm_idx1 + blm(idx_sph) * dplm_idx2);
+    }
+
+    ylm = phase * plm_idx;
+
+    dyx.re = plm_idx;
+    dyx.im = 0.0;
+    dyy.re = 0.0;
+    dyy.im = plm_idx;
+    dyz.re = phase.re * dplm_idx;
+    dyz.im = phase.im * dplm_idx;
+
+    rdy.re = rx * dyx.re + rz * dyz.re; // ry * dyy.re term omitted: dyy.re == 0 for m = 1
+    rdy.im = ry * dyy.im + rz * dyz.im; // rx * dyx.im term omitted: dyx.im == 0 for m = 1
+
+    dylm[0].re = dyx.re - rdy.re * rx;
+    dylm[0].im = -rdy.im * rx;
+    dylm[1].re = -rdy.re * ry;
+    dylm[1].im = dyy.im - rdy.im * ry;
+    dylm[2].re = dyz.re - rdy.re * rz;
+    dylm[2].im = dyz.im - rdy.im * rz;
+
+    for (int n = 0; n < nradmax; n++) {
+
+      const double R_over_r = fr(ii, jj, l, n) * rinv;
+      const double DR = dfr(ii, jj, l, n);
+      const complex Y_DR = ylm * DR;
+
+      complex w = weights(ii, mu_j, idx_sph, n);
+      if (w.re == 0.0 && w.im == 0.0) continue;
+      // counting for -m cases if m > 0
+      w.re *= 2.0;
+      w.im *= 2.0;
+
+      complex grad_phi_nlm[3];
+      grad_phi_nlm[0] = Y_DR * r_hat[0] + dylm[0] * R_over_r;
+      grad_phi_nlm[1] = Y_DR * r_hat[1] + dylm[1] * R_over_r;
+      grad_phi_nlm[2] = Y_DR * r_hat[2] + dylm[2] * R_over_r;
+      // real-part multiplication only
+      f_ji[0] += w.real_part_product(grad_phi_nlm[0]);
+      f_ji[1] += w.real_part_product(grad_phi_nlm[1]);
+      f_ji[2] += w.real_part_product(grad_phi_nlm[2]);
+    }
+
+    plm_idx2 = plm_idx1;
+    dplm_idx2 = dplm_idx1;
+
+    plm_idx1 = plm_idx;
+    dplm_idx1 = dplm_idx;
+
+    idx_sph++;
+  }
+
+  plm_idx = plm_idx1 = plm_idx2 = 0.0;
+  dplm_idx = dplm_idx1 = dplm_idx2 = 0.0;
+
+  double plm_mm1_mm1 = -sq3o2 * Y00; // (1, 1)
+
+  // m > 1
+  phasem = phase;
+  for (int m = 2; m <= lmax; m++) {
+
+    mphasem1.re = phasem.re * double(m);
+    mphasem1.im = phasem.im * double(m);
+    phasem = phasem * phase;
+
+    for (int l = m; l <= lmax; l++) {
+      // const int idx = l * (l + 1) + m;
+
+      if (l == m) {
+        plm_idx = cl(l) * plm_mm1_mm1; // (m, m) built from (m - 1, m - 1)
+        dplm_idx = 0.0;
+        plm_mm1_mm1 = plm_idx; // from here on holds (m, m), the seed for the next m
+      } else if (l == (m + 1)) {
+        const double t = dl(l) * plm_mm1_mm1; // plm_mm1_mm1 is (m, m) = (l - 1, l - 1) at this point
+        plm_idx = t * rz; // (m + 1, m)
+        dplm_idx = t;
+      } else {
+        plm_idx = alm(idx_sph) * (rz * plm_idx1 + blm(idx_sph) * plm_idx2); // (l, m) from (l - 1, m) and (l - 2, m)
+        dplm_idx = alm(idx_sph) * (plm_idx1 + rz * dplm_idx1 + blm(idx_sph) * dplm_idx2);
+      }
+
+      ylm.re = phasem.re * plm_idx;
+      ylm.im = phasem.im * plm_idx;
+
+      dyx = mphasem1 * plm_idx;
+      dyy.re = -dyx.im;
+      dyy.im = dyx.re;
+      dyz = phasem * dplm_idx;
+
+      rdy.re = rx * dyx.re + ry * dyy.re + rz * dyz.re;
+      rdy.im = rx * dyx.im + ry * dyy.im + rz * dyz.im;
+
+      dylm[0].re = dyx.re - rdy.re * rx;
+      dylm[0].im = dyx.im - rdy.im * rx;
+      dylm[1].re = dyy.re - rdy.re * ry;
+      dylm[1].im = dyy.im - rdy.im * ry;
+      dylm[2].re = dyz.re - rdy.re * rz;
+      dylm[2].im = dyz.im - rdy.im * rz;
+
+      for (int n = 0; n < nradmax; n++) {
+
+        const double R_over_r = fr(ii, jj, l, n) * rinv;
+        const double DR = dfr(ii, jj, l, n);
+        const complex Y_DR = ylm * DR;
+
+        complex w = weights(ii, mu_j, idx_sph, n);
         if (w.re == 0.0 && w.im == 0.0) continue;
         // counting for -m cases if m > 0
-        if (m > 0) {
-          w.re *= 2.0;
-          w.im *= 2.0;
-        }
-
-        complex DY[3];
-        DY[0] = dylm(ii, jj, idx, 0);
-        DY[1] = dylm(ii, jj, idx, 1);
-        DY[2] = dylm(ii, jj, idx, 2);
-        const complex Y_DR = ylm(ii, jj, idx) * DR;
+        w.re *= 2.0;
+        w.im *= 2.0;
 
         complex grad_phi_nlm[3];
-        grad_phi_nlm[0] = Y_DR * r_hat[0] + DY[0] * R_over_r;
-        grad_phi_nlm[1] = Y_DR * r_hat[1] + DY[1] * R_over_r;
-        grad_phi_nlm[2] = Y_DR * r_hat[2] + DY[2] * R_over_r;
+        grad_phi_nlm[0] = Y_DR * r_hat[0] + dylm[0] * R_over_r;
+        grad_phi_nlm[1] = Y_DR * r_hat[1] + dylm[1] * R_over_r;
+        grad_phi_nlm[2] = Y_DR * r_hat[2] + dylm[2] * R_over_r;
         // real-part multiplication only
         f_ji[0] += w.real_part_product(grad_phi_nlm[0]);
         f_ji[1] += w.real_part_product(grad_phi_nlm[1]);
         f_ji[2] += w.real_part_product(grad_phi_nlm[2]);
       }
+
+      plm_idx2 = plm_idx1;
+      dplm_idx2 = dplm_idx1;
+
+      plm_idx1 = plm_idx;
+      dplm_idx1 = dplm_idx;
+
+      idx_sph++;
     }
   }
 
@@ -1461,31 +1763,46 @@ void PairPACEExtrapolationKokkos<DeviceType>::v_tally_xyz(EV_FLOAT &ev, const in
 template<class DeviceType>
 void PairPACEExtrapolationKokkos<DeviceType>::pre_compute_harmonics(int lmax)
 {
+  auto h_idx_sph = Kokkos::create_mirror_view(d_idx_sph);
   auto h_alm = Kokkos::create_mirror_view(alm);
   auto h_blm = Kokkos::create_mirror_view(blm);
   auto h_cl = Kokkos::create_mirror_view(cl);
   auto h_dl = Kokkos::create_mirror_view(dl);
 
-  for (int l = 1; l <= lmax; l++) {
-    const double lsq = l * l;
-    const double ld = 2 * l;
-    const double l1 = (4 * lsq - 1);
-    const double l2 = lsq - ld + 1;
-    for (int m = 0; m < l - 1; m++) {
-      const double msq = m * m;
-      const double a = sqrt((double(l1)) / (double(lsq - msq)));
-      const double b = -sqrt((double(l2 - msq)) / (double(4 * l2 - 1)));
+  Kokkos::deep_copy(h_idx_sph,-1); // -1 = no compact slot; only the (l, m >= 0) entries are assigned below
+
+  int idx_sph = 0;
+  for (int m = 0; m <= lmax; m++) {
+    const double msq = m * m;
+    for (int l = m; l <= lmax; l++) {
       const int idx = l * (l + 1) + m; // (l, m)
-      h_alm(idx) = a;
-      h_blm(idx) = b;
+      h_idx_sph(idx) = idx_sph; // map (l, m) -> compact slot, numbered in m-major order
+
+      double a = 0.0;
+      double b = 0.0;
+
+      if (l > 1 && l != m) { // l == m (and l <= 1) entries keep a = b = 0
+        const double lsq = l * l;
+        const double ld = 2 * l;
+        const double l1 = (4 * lsq - 1);
+        const double l2 = lsq - ld + 1; // (l - 1)^2
+
+        a = sqrt((double(l1)) / (double(lsq - msq)));
+        b = -sqrt((double(l2 - msq)) / (double(4 * l2 - 1)));
+      }
+      h_alm(idx_sph) = a; // alm/blm are indexed by the compact slot, not by idx
+      h_blm(idx_sph) = b;
+      idx_sph++;
     }
   }
+  idx_sph_max = idx_sph;
 
   for (int l = 1; l <= lmax; l++) {
     h_cl(l) = -sqrt(1.0 + 0.5 / (double(l)));
     h_dl(l) = sqrt(double(2 * (l - 1) + 3));
   }
 
+  Kokkos::deep_copy(d_idx_sph, h_idx_sph);
   Kokkos::deep_copy(alm, h_alm);
   Kokkos::deep_copy(blm, h_blm);
   Kokkos::deep_copy(cl, h_cl);
@@ -1494,143 +1811,6 @@ void PairPACEExtrapolationKokkos<DeviceType>::pre_compute_harmonics(int lmax)
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairPACEExtrapolationKokkos<DeviceType>::compute_barplm(int ii, int jj, double rz, int lmax) const
-{
-  // requires -1 <= rz <= 1 , NO CHECKING IS PERFORMED !!!!!!!!!
-  // prefactors include 1/sqrt(2) factor compared to reference
-
-  // l=0, m=0
-  // plm(ii, jj, 0, 0) = Y00/sq1o4pi; //= sq1o4pi;
-  plm(ii, jj, 0) = Y00; //= 1;
-  dplm(ii, jj, 0) = 0.0;
-
-  if (lmax > 0) {
-
-    // l=1, m=0
-    plm(ii, jj, 2) = Y00 * sq3 * rz;
-    dplm(ii, jj, 2) = Y00 * sq3;
-
-    // l=1, m=1
-    plm(ii, jj, 3) = -sq3o2 * Y00;
-    dplm(ii, jj, 3) = 0.0;
-
-    // loop l = 2, lmax
-    for (int l = 2; l <= lmax; l++) {
-      for (int m = 0; m < l - 1; m++) {
-        const int idx = l * (l + 1) + m; // (l, m)
-        const int idx1 = (l - 1) * l + m; // (l - 1, m)
-        const int idx2 = (l - 2) * (l - 1) + m; // (l - 2, m)
-        plm(ii, jj, idx) = alm(idx) * (rz * plm(ii, jj, idx1) + blm(idx) * plm(ii, jj, idx2));
-        dplm(ii, jj, idx) = alm(idx) * (plm(ii, jj, idx1) + rz * dplm(ii, jj, idx1) + blm(idx) * dplm(ii, jj, idx2));
-      }
-      const int idx = l * (l + 1) + l; // (l, l)
-      const int idx1 = l * (l + 1) + l - 1; // (l, l - 1)
-      const int idx2 = (l - 1) * l + l - 1; // (l - 1, l - 1)
-      const double t = dl(l) * plm(ii, jj, idx2);
-      plm(ii, jj, idx1) = t * rz;
-      dplm(ii, jj, idx1) = t;
-      plm(ii, jj, idx) = cl(l) * plm(ii, jj, idx2);
-      dplm(ii, jj, idx) = 0.0;
-    }
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairPACEExtrapolationKokkos<DeviceType>::compute_ylm(int ii, int jj, double rx, double ry, double rz, int lmax) const
-{
-  // requires rx^2 + ry^2 + rz^2 = 1 , NO CHECKING IS PERFORMED !!!!!!!!!
-
-  complex phase;
-  complex phasem, mphasem1;
-  complex dyx, dyy, dyz;
-  complex rdy;
-
-  phase.re = rx;
-  phase.im = ry;
-
-  // compute barplm
-  compute_barplm(ii, jj, rz, lmax);
-
-  // m = 0
-  for (int l = 0; l <= lmax; l++) {
-    const int idx = l * (l + 1);
-
-    ylm(ii, jj, idx).re = plm(ii, jj, idx);
-    ylm(ii, jj, idx).im = 0.0;
-
-    dyz.re = dplm(ii, jj, idx);
-    rdy.re = dyz.re * rz;
-
-    dylm(ii, jj, idx, 0).re = -rdy.re * rx;
-    dylm(ii, jj, idx, 0).im = 0.0;
-    dylm(ii, jj, idx, 1).re = -rdy.re * ry;
-    dylm(ii, jj, idx, 1).im = 0.0;
-    dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
-    dylm(ii, jj, idx, 2).im = 0;
-  }
-  // m = 1
-  for (int l = 1; l <= lmax; l++) {
-    const int idx = l * (l + 1) + 1;
-
-    ylm(ii, jj, idx) = phase * plm(ii, jj, idx);
-
-    dyx.re = plm(ii, jj, idx);
-    dyx.im = 0.0;
-    dyy.re = 0.0;
-    dyy.im = plm(ii, jj, idx);
-    dyz.re = phase.re * dplm(ii, jj, idx);
-    dyz.im = phase.im * dplm(ii, jj, idx);
-
-    rdy.re = rx * dyx.re + +rz * dyz.re;
-    rdy.im = ry * dyy.im + rz * dyz.im;
-
-    dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx;
-    dylm(ii, jj, idx, 0).im = -rdy.im * rx;
-    dylm(ii, jj, idx, 1).re = -rdy.re * ry;
-    dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry;
-    dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
-    dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz;
-  }
-
-  // m > 1
-  phasem = phase;
-  for (int m = 2; m <= lmax; m++) {
-
-    mphasem1.re = phasem.re * double(m);
-    mphasem1.im = phasem.im * double(m);
-    phasem = phasem * phase;
-
-    for (int l = m; l <= lmax; l++) {
-      const int idx = l * (l + 1) + m;
-
-      ylm(ii, jj, idx).re = phasem.re * plm(ii, jj, idx);
-      ylm(ii, jj, idx).im = phasem.im * plm(ii, jj, idx);
-
-      dyx = mphasem1 * plm(ii, jj, idx);
-      dyy.re = -dyx.im;
-      dyy.im = dyx.re;
-      dyz = phasem * dplm(ii, jj, idx);
-
-      rdy.re = rx * dyx.re + ry * dyy.re + rz * dyz.re;
-      rdy.im = rx * dyx.im + ry * dyy.im + rz * dyz.im;
-
-      dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx;
-      dylm(ii, jj, idx, 0).im = dyx.im - rdy.im * rx;
-      dylm(ii, jj, idx, 1).re = dyy.re - rdy.re * ry;
-      dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry;
-      dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
-      dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz;
-    }
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEExtrapolationKokkos<DeviceType>::cutoff_func_poly(const double r, const double r_in, const double delta_in, double &fc, double &dfc) const
@@ -1759,11 +1939,11 @@ void PairPACEExtrapolationKokkos<DeviceType>::evaluate_splines(const int ii, con
   spline_gk.calcSplines(ii, jj, r, gr, dgr);
 
   spline_rnl.calcSplines(ii, jj, r, d_values, d_derivatives);
-  for (int kk = 0; kk < (int)fr.extent(2); kk++) {
-    for (int ll = 0; ll < (int)fr.extent(3); ll++) {
-      const int flatten = kk*fr.extent(3) + ll;
-      fr(ii, jj, kk, ll) = d_values(ii, jj, flatten);
-      dfr(ii, jj, kk, ll) = d_derivatives(ii, jj, flatten);
+  for (int ll = 0; ll < (int)fr.extent(2); ll++) {
+    for (int kk = 0; kk < (int)fr.extent(3); kk++) {
+      const int flatten = kk*fr.extent(2) + ll; // flattened spline index: kk-major, ll fastest
+      fr(ii, jj, ll, kk) = d_values(ii, jj, flatten); // fr/dfr take ll as 3rd and kk as 4th index
+      dfr(ii, jj, ll, kk) = d_derivatives(ii, jj, flatten);
     }
   }
 
@@ -1783,7 +1963,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::SplineInterpolatorKokkos::operator
     rscalelookup = spline.rscalelookup;
     num_of_functions = spline.num_of_functions;
 
-    lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions);
+    lookupTable = t_ace_3d4_lr("lookupTable", ntot+1, num_of_functions);
     auto h_lookupTable = Kokkos::create_mirror_view(lookupTable);
     for (int i = 0; i < ntot+1; i++)
         for (int j = 0; j < num_of_functions; j++)
@@ -1889,10 +2069,6 @@ double PairPACEExtrapolationKokkos<DeviceType>::memory_usage()
   bytes += MemKK::memory_usage(d_derivatives);
   bytes += MemKK::memory_usage(cr);
   bytes += MemKK::memory_usage(dcr);
-  bytes += MemKK::memory_usage(plm);
-  bytes += MemKK::memory_usage(dplm);
-  bytes += MemKK::memory_usage(ylm);
-  bytes += MemKK::memory_usage(dylm);
   bytes += MemKK::memory_usage(d_ncount);
   bytes += MemKK::memory_usage(d_mu);
   bytes += MemKK::memory_usage(d_rhats);
@@ -1911,7 +2087,7 @@ double PairPACEExtrapolationKokkos<DeviceType>::memory_usage()
   bytes += MemKK::memory_usage(d_idx_ms_combs_count);
   bytes += MemKK::memory_usage(d_rank);
   bytes += MemKK::memory_usage(d_num_ms_combs);
-  bytes += MemKK::memory_usage(d_func_inds);
+  bytes += MemKK::memory_usage(d_idx_funcs);
   bytes += MemKK::memory_usage(d_mus);
   bytes += MemKK::memory_usage(d_ns);
   bytes += MemKK::memory_usage(d_ls);
@@ -1940,47 +2116,6 @@ double PairPACEExtrapolationKokkos<DeviceType>::memory_usage()
   return bytes;
 }
 
-/* ----------------------------------------------------------------------
-    extract method for extracting value of scale variable
- ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-void *PairPACEExtrapolationKokkos<DeviceType>::extract(const char *str, int &dim)
-{
-  dim = 0;
-  //check if str=="flag_compute_extrapolation_grade" then compute extrapolation grades on this iteration
-  if (strcmp(str, "gamma_flag") == 0) return (void *) &flag_compute_extrapolation_grade;
-  if (strcmp(str, "corerep_flag") == 0) return (void *) &flag_corerep_factor;
-
-  dim = 2;
-  if (strcmp(str, "scale") == 0) return (void *) scale;
-  return nullptr;
-}
-
-/* ----------------------------------------------------------------------
-   peratom requests from FixPair
-   return ptr to requested data
-   also return ncol = # of quantites per atom
-     0 = per-atom vector
-     1 or more = # of columns in per-atom array
-   return NULL if str is not recognized
----------------------------------------------------------------------- */
-
-template<class DeviceType>
-void *PairPACEExtrapolationKokkos<DeviceType>::extract_peratom(const char *str, int &ncol)
-{
-  if (strcmp(str, "gamma") == 0) {
-    ncol = 0;
-    return (void *) extrapolation_grade_gamma;
-  }
-  if (strcmp(str, "corerep") == 0) {
-    ncol = 0;
-    return (void *) corerep_factor;
-  }
-
-  return nullptr;
-}
-
 /* ---------------------------------------------------------------------- */
 
 namespace LAMMPS_NS {
@@ -1989,4 +2124,3 @@ template class PairPACEExtrapolationKokkos<LMPDeviceType>;
 template class PairPACEExtrapolationKokkos<LMPHostType>;
 #endif
 }
-
diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
index aa6c49c36d..df8a0c1740 100644
--- a/src/KOKKOS/pair_pace_extrapolation_kokkos.h
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
@@ -36,7 +36,6 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
  public:
   struct TagPairPACEComputeNeigh{};
   struct TagPairPACEComputeRadial{};
-  struct TagPairPACEComputeYlm{};
   struct TagPairPACEComputeAi{};
   struct TagPairPACEConjugateAi{};
   struct TagPairPACEComputeRho{};
@@ -67,9 +66,6 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairPACEComputeRadial,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeRadial>::member_type& team) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairPACEComputeYlm,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm>::member_type& team) const;
-
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairPACEComputeAi,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeAi>::member_type& team) const;
 
@@ -99,12 +95,8 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>,const int& ii, EV_FLOAT&) const;
 
-
-  void *extract(const char *str, int &dim) override;
-  void *extract_peratom(const char *str, int &ncol) override;
-
  protected:
-  int inum, maxneigh, chunk_size, chunk_offset, idx_ms_combs_max, total_num_functions_max;
+  int inum, maxneigh, chunk_size, chunk_offset, idx_ms_combs_max, total_num_functions_max, idx_sph_max;
   int host_flag;
 
   int eflag, vflag;
@@ -165,12 +157,6 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
       const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz,
       const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void compute_barplm(int, int, double, int) const;
-
-  KOKKOS_INLINE_FUNCTION
-  void compute_ylm(int, int, double, double, double, int) const;
-
   KOKKOS_INLINE_FUNCTION
   void cutoff_func_poly(const double, const double, const double, double &, double &) const;
 
@@ -202,15 +188,19 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
 
   typedef Kokkos::View<int*, DeviceType> t_ace_1i;
   typedef Kokkos::View<int**, DeviceType> t_ace_2i;
+  typedef Kokkos::View<int**, Kokkos::LayoutRight, DeviceType> t_ace_2i_lr;
   typedef Kokkos::View<int***, DeviceType> t_ace_3i;
+  typedef Kokkos::View<int***, Kokkos::LayoutRight, DeviceType> t_ace_3i_lr;
   typedef Kokkos::View<int****, DeviceType> t_ace_4i;
   typedef Kokkos::View<double*, DeviceType> t_ace_1d;
   typedef Kokkos::View<double**, DeviceType> t_ace_2d;
+  typedef Kokkos::View<double**, Kokkos::LayoutRight, DeviceType> t_ace_2d_lr;
   typedef Kokkos::View<double*[3], DeviceType> t_ace_2d3;
   typedef Kokkos::View<double***, DeviceType> t_ace_3d;
   typedef Kokkos::View<const double***, DeviceType> tc_ace_3d;
   typedef Kokkos::View<double**[3], DeviceType> t_ace_3d3;
   typedef Kokkos::View<double**[4], DeviceType> t_ace_3d4;
+  typedef Kokkos::View<double**[4], Kokkos::LayoutRight, DeviceType> t_ace_3d4_lr;
   typedef Kokkos::View<double****, DeviceType> t_ace_4d;
   typedef Kokkos::View<complex*, DeviceType> t_ace_1c;
   typedef Kokkos::View<complex**, DeviceType> t_ace_2c;
@@ -260,25 +250,16 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
   th_ace_1d h_gamma;
 
   // Spherical Harmonics
+
   void pre_compute_harmonics(int);
 
-  KOKKOS_INLINE_FUNCTION
-  void compute_barplm(double rz, int lmaxi);
-
-  KOKKOS_INLINE_FUNCTION
-  void compute_ylm(double rx, double ry, double rz, int lmaxi);
-
+  t_ace_4c A_sph; // indexed (atom, element, idx_sph, n) in the kernels; one idx_sph slot per (l, m >= 0)
+  t_ace_1d d_idx_sph; // l*(l+1)+m -> idx_sph lookup, -1 if none; NOTE(review): integer table in a double view, consider t_ace_1i
   t_ace_1d alm;
   t_ace_1d blm;
   t_ace_1d cl;
   t_ace_1d dl;
 
-  t_ace_3d plm;
-  t_ace_3d dplm;
-
-  t_ace_3c ylm;
-  t_ace_4c3 dylm;
-
   // short neigh list
   t_ace_1i d_ncount;
   t_ace_2d d_mu;
@@ -297,20 +278,19 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
   t_ace_1d d_rho_core_cutoff;
   t_ace_1d d_drho_core_cutoff;
   t_ace_1d d_E0vals;
-  t_ace_2d d_wpre;
-  t_ace_2d d_mexp;
+  t_ace_2d_lr d_wpre;
+  t_ace_2d_lr d_mexp;
 
   // tilde
   t_ace_1i d_idx_ms_combs_count;
   t_ace_1i d_total_basis_size;
-  t_ace_2i d_rank;
-  t_ace_2i d_num_ms_combs;
-  t_ace_2i d_func_inds;
-  t_ace_3i d_mus;
-  t_ace_3i d_ns;
-  t_ace_3i d_ls;
-  t_ace_3i d_ms_combs;
-//  t_ace_3d d_ctildes;
+  t_ace_2i_lr d_rank;
+  t_ace_2i_lr d_num_ms_combs;
+  t_ace_2i_lr d_idx_funcs;
+  t_ace_3i_lr d_mus;
+  t_ace_3i_lr d_ns;
+  t_ace_3i_lr d_ls;
+  t_ace_3i_lr d_ms_combs;
   t_ace_2d d_gen_cgs;
   t_ace_3d d_coeffs;
 
@@ -321,12 +301,12 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
     int ntot, nlut, num_of_functions;
     double cutoff, deltaSplineBins, invrscalelookup, rscalelookup;
 
-    t_ace_3d4 lookupTable;
+    t_ace_3d4_lr lookupTable;
 
     void operator=(const SplineInterpolator &spline);
 
     void deallocate() {
-      lookupTable = t_ace_3d4();
+      lookupTable = t_ace_3d4_lr();
     }
 
     double memory_usage() {
diff --git a/src/KOKKOS/pair_pace_kokkos.cpp b/src/KOKKOS/pair_pace_kokkos.cpp
index 805d7f68bb..4046649375 100644
--- a/src/KOKKOS/pair_pace_kokkos.cpp
+++ b/src/KOKKOS/pair_pace_kokkos.cpp
@@ -29,11 +29,13 @@
 #include "neighbor_kokkos.h"
 #include "neigh_request.h"
 
+#include "ace-evaluator/ace_version.h"
+#include "ace-evaluator/ace_radial.h"
+
 #include "ace-evaluator/ace_c_basis.h"
 #include "ace-evaluator/ace_evaluator.h"
 #include "ace-evaluator/ace_recursive.h"
-#include "ace-evaluator/ace_version.h"
-#include "ace-evaluator/ace_radial.h"
+
 #include <cstring>
 
 namespace LAMMPS_NS {
@@ -104,18 +106,19 @@ void PairPACEKokkos<DeviceType>::grow(int natom, int maxneigh)
 
   if ((int)A.extent(0) < natom) {
 
-    MemKK::realloc_kokkos(A, "pace:A", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(A_sph, "pace:A_sph", natom, nelements, idx_sph_max, nradmax + 1);
+    MemKK::realloc_kokkos(A, "pace:A", natom, nelements, (lmax + 1) * (lmax + 1), nradmax + 1);
     MemKK::realloc_kokkos(A_rank1, "pace:A_rank1", natom, nelements, nradbase);
 
-    MemKK::realloc_kokkos(A_list, "pace:A_list", natom, idx_rho_max, basis_set->rankmax);
+    MemKK::realloc_kokkos(A_list, "pace:A_list", natom, idx_ms_combs_max, basis_set->rankmax);
     //size is +1 of max to avoid out-of-boundary array access in double-triangular scheme
-    MemKK::realloc_kokkos(A_forward_prod, "pace:A_forward_prod", natom, idx_rho_max, basis_set->rankmax + 1);
+    MemKK::realloc_kokkos(A_forward_prod, "pace:A_forward_prod", natom, idx_ms_combs_max, basis_set->rankmax + 1);
 
     MemKK::realloc_kokkos(e_atom, "pace:e_atom", natom);
     MemKK::realloc_kokkos(rhos, "pace:rhos", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion
     MemKK::realloc_kokkos(dF_drho, "pace:dF_drho", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion
 
-    MemKK::realloc_kokkos(weights, "pace:weights", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(weights, "pace:weights", natom, nelements, idx_sph_max, nradmax + 1);
     MemKK::realloc_kokkos(weights_rank1, "pace:weights_rank1", natom, nelements, nradbase);
 
     // hard-core repulsion
@@ -126,14 +129,14 @@ void PairPACEKokkos<DeviceType>::grow(int natom, int maxneigh)
     MemKK::realloc_kokkos(d_jj_min, "pace:j_min_pair", natom);
     MemKK::realloc_kokkos(d_corerep, "pace:corerep", natom); // per-atom corerep
 
-    MemKK::realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_rho_max, basis_set->rankmax);
+    MemKK::realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_ms_combs_max, basis_set->rankmax);
   }
 
-  if (((int)ylm.extent(0) < natom) || ((int)ylm.extent(1) < maxneigh)) {
+  if (((int)fr.extent(0) < natom) || ((int)fr.extent(1) < maxneigh)) {
 
     // radial functions
-    MemKK::realloc_kokkos(fr, "pace:fr", natom, maxneigh, nradmax, lmax + 1);
-    MemKK::realloc_kokkos(dfr, "pace:dfr", natom, maxneigh, nradmax, lmax + 1);
+    MemKK::realloc_kokkos(fr, "pace:fr", natom, maxneigh, lmax + 1, nradmax);
+    MemKK::realloc_kokkos(dfr, "pace:dfr", natom, maxneigh, lmax + 1, nradmax);
     MemKK::realloc_kokkos(gr, "pace:gr", natom, maxneigh, nradbase);
     MemKK::realloc_kokkos(dgr, "pace:dgr", natom, maxneigh, nradbase);
     const int max_num_functions = MAX(nradbase, nradmax*(lmax + 1));
@@ -144,12 +147,6 @@ void PairPACEKokkos<DeviceType>::grow(int natom, int maxneigh)
     MemKK::realloc_kokkos(cr, "pace:cr", natom, maxneigh);
     MemKK::realloc_kokkos(dcr, "pace:dcr", natom, maxneigh);
 
-    // spherical harmonics
-    MemKK::realloc_kokkos(plm, "pace:plm", natom, maxneigh, (lmax + 1) * (lmax + 1));
-    MemKK::realloc_kokkos(dplm, "pace:dplm", natom, maxneigh, (lmax + 1) * (lmax + 1));
-    MemKK::realloc_kokkos(ylm, "pace:ylm", natom, maxneigh, (lmax + 1) * (lmax + 1));
-    MemKK::realloc_kokkos(dylm, "pace:dylm", natom, maxneigh, (lmax + 1) * (lmax + 1));
-
     // short neigh list
     MemKK::realloc_kokkos(d_ncount, "pace:ncount", natom);
     MemKK::realloc_kokkos(d_mu, "pace:mu", natom, maxneigh);
@@ -184,7 +181,7 @@ void PairPACEKokkos<DeviceType>::copy_pertype()
     h_rho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).rho_core_cutoff;
     h_drho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).drho_core_cutoff;
 
-    h_E0vals(n)= basis_set->E0vals(n);
+    h_E0vals(n) = basis_set->E0vals(n);
 
     h_ndensity(n) = basis_set->map_embedding_specifications.at(n).ndensity;
 
@@ -225,10 +222,10 @@ void PairPACEKokkos<DeviceType>::copy_pertype()
   auto h_dcut_in = Kokkos::create_mirror_view(d_dcut_in);
 
   for (int mu_i = 0; mu_i < nelements; ++mu_i) {
-        for (int mu_j = 0; mu_j < nelements; ++mu_j) {
-            h_cut_in(mu_i,mu_j) = basis_set->map_bond_specifications.at({mu_i,mu_j}).rcut_in;
-            h_dcut_in(mu_i,mu_j) = basis_set->map_bond_specifications.at({mu_i,mu_j}).dcut_in;
-        }
+    for (int mu_j = 0; mu_j < nelements; ++mu_j) {
+      h_cut_in(mu_i,mu_j) = basis_set->map_bond_specifications.at({mu_i,mu_j}).rcut_in;
+      h_dcut_in(mu_i,mu_j) = basis_set->map_bond_specifications.at({mu_i,mu_j}).dcut_in;
+    }
   }
   Kokkos::deep_copy(d_cut_in, h_cut_in);
   Kokkos::deep_copy(d_dcut_in, h_dcut_in);
@@ -288,50 +285,50 @@ void PairPACEKokkos<DeviceType>::copy_tilde()
 
   // flatten loops, get per-element count and max
 
-  idx_rho_max = 0;
+  idx_ms_combs_max = 0;
   int total_basis_size_max = 0;
 
-  MemKK::realloc_kokkos(d_idx_rho_count, "pace:idx_rho_count", nelements);
-  auto h_idx_rho_count = Kokkos::create_mirror_view(d_idx_rho_count);
+  MemKK::realloc_kokkos(d_idx_ms_combs_count, "pace:idx_ms_combs_count", nelements);
+  auto h_idx_ms_combs_count = Kokkos::create_mirror_view(d_idx_ms_combs_count);
 
-  for (int n = 0; n < nelements; n++) {
-    int idx_rho = 0;
-    const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n];
-    const int total_basis_size = basis_set->total_basis_size[n];
+  for (int mu = 0; mu < nelements; mu++) {
+    int idx_ms_combs = 0;
+    const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[mu];
+    const int total_basis_size = basis_set->total_basis_size[mu];
 
-    ACECTildeBasisFunction *basis = basis_set->basis[n];
+    ACECTildeBasisFunction *basis = basis_set->basis[mu];
 
     // rank=1
     for (int func_rank1_ind = 0; func_rank1_ind < total_basis_size_rank1; ++func_rank1_ind)
-      idx_rho++;
+      idx_ms_combs++;
 
     // rank > 1
-    for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) {
-      ACECTildeBasisFunction *func = &basis[func_ind];
+    for (int idx_func = 0; idx_func < total_basis_size; ++idx_func) {
+      ACECTildeBasisFunction *func = &basis[idx_func];
 
       // loop over {ms} combinations in sum
       for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind)
-        idx_rho++;
+        idx_ms_combs++;
     }
-    h_idx_rho_count(n) = idx_rho;
-    idx_rho_max = MAX(idx_rho_max, idx_rho);
+    h_idx_ms_combs_count(mu) = idx_ms_combs;
+    idx_ms_combs_max = MAX(idx_ms_combs_max, idx_ms_combs);
     total_basis_size_max = MAX(total_basis_size_max, total_basis_size_rank1 + total_basis_size);
   }
 
-  Kokkos::deep_copy(d_idx_rho_count, h_idx_rho_count);
+  Kokkos::deep_copy(d_idx_ms_combs_count, h_idx_ms_combs_count);
 
   MemKK::realloc_kokkos(d_rank, "pace:rank", nelements, total_basis_size_max);
   MemKK::realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_basis_size_max);
-  MemKK::realloc_kokkos(d_offsets, "pace:offsets", nelements, idx_rho_max);
+  MemKK::realloc_kokkos(d_idx_funcs, "pace:idx_func", nelements, idx_ms_combs_max);
   MemKK::realloc_kokkos(d_mus, "pace:mus", nelements, total_basis_size_max, basis_set->rankmax);
   MemKK::realloc_kokkos(d_ns, "pace:ns", nelements, total_basis_size_max, basis_set->rankmax);
   MemKK::realloc_kokkos(d_ls, "pace:ls", nelements, total_basis_size_max, basis_set->rankmax);
-  MemKK::realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_rho_max, basis_set->rankmax);
-  MemKK::realloc_kokkos(d_ctildes, "pace:ctildes", nelements, idx_rho_max, basis_set->ndensitymax);
+  MemKK::realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_ms_combs_max, basis_set->rankmax);
+  MemKK::realloc_kokkos(d_ctildes, "pace:ctildes", nelements, idx_ms_combs_max, basis_set->ndensitymax);
 
   auto h_rank = Kokkos::create_mirror_view(d_rank);
   auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs);
-  auto h_offsets = Kokkos::create_mirror_view(d_offsets);
+  auto h_idx_funcs = Kokkos::create_mirror_view(d_idx_funcs);
   auto h_mus = Kokkos::create_mirror_view(d_mus);
   auto h_ns = Kokkos::create_mirror_view(d_ns);
   auto h_ls = Kokkos::create_mirror_view(d_ls);
@@ -340,63 +337,66 @@ void PairPACEKokkos<DeviceType>::copy_tilde()
 
   // copy values on host
 
-  for (int n = 0; n < nelements; n++) {
-    const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n];
-    const int total_basis_size = basis_set->total_basis_size[n];
+  for (int mu = 0; mu < nelements; mu++) {
+    const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[mu];
+    const int total_basis_size = basis_set->total_basis_size[mu];
 
-    ACECTildeBasisFunction *basis_rank1 = basis_set->basis_rank1[n];
-    ACECTildeBasisFunction *basis = basis_set->basis[n];
+    ACECTildeBasisFunction *basis_rank1 = basis_set->basis_rank1[mu];
+    ACECTildeBasisFunction *basis = basis_set->basis[mu];
 
-    const int ndensity = basis_set->map_embedding_specifications.at(n).ndensity;
+    const int ndensity = basis_set->map_embedding_specifications.at(mu).ndensity;
 
-    int idx_rho = 0;
+    int idx_ms_combs = 0;
 
     // rank=1
-    for (int offset = 0; offset < total_basis_size_rank1; ++offset) {
-      ACECTildeBasisFunction *func = &basis_rank1[offset];
-      h_rank(n, offset) = 1;
-      h_mus(n, offset, 0) = func->mus[0];
-      h_ns(n, offset, 0) = func->ns[0];
-      for (int p = 0; p < ndensity; p++)
-        h_ctildes(n, idx_rho, p) = func->ctildes[p];
-      h_offsets(n, idx_rho) = offset;
-      idx_rho++;
+    for (int idx_func = 0; idx_func < total_basis_size_rank1; ++idx_func) {
+      ACECTildeBasisFunction *func = &basis_rank1[idx_func];
+      h_rank(mu, idx_func) = 1;
+      h_mus(mu, idx_func, 0) = func->mus[0];
+      h_ns(mu, idx_func, 0) = func->ns[0];
+
+      for (int p = 0; p < ndensity; ++p)
+        h_ctildes(mu, idx_ms_combs, p) = func->ctildes[p];
+
+      h_idx_funcs(mu, idx_ms_combs) = idx_func;
+      idx_ms_combs++;
     }
 
     // rank > 1
-    for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) {
-      ACECTildeBasisFunction *func = &basis[func_ind];
+    for (int idx_func = 0; idx_func < total_basis_size; ++idx_func) {
+      ACECTildeBasisFunction *func = &basis[idx_func];
       // TODO: check if func->ctildes are zero, then skip
 
-      const int offset = total_basis_size_rank1 + func_ind;
+      const int idx_func_through = total_basis_size_rank1 + idx_func;
 
-      const int rank = h_rank(n, offset) = func->rank;
-      h_num_ms_combs(n, offset) = func->num_ms_combs;
+      const int rank = h_rank(mu, idx_func_through) = func->rank;
+      h_num_ms_combs(mu, idx_func_through) = func->num_ms_combs;
       for (int t = 0; t < rank; t++) {
-        h_mus(n, offset, t) = func->mus[t];
-        h_ns(n, offset, t) = func->ns[t];
-        h_ls(n, offset, t) = func->ls[t];
+        h_mus(mu, idx_func_through, t) = func->mus[t];
+        h_ns(mu, idx_func_through, t) = func->ns[t];
+        h_ls(mu, idx_func_through, t) = func->ls[t];
       }
 
       // loop over {ms} combinations in sum
       for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind) {
         auto ms = &func->ms_combs[ms_ind * rank]; // current ms-combination (of length = rank)
         for (int t = 0; t < rank; t++)
-          h_ms_combs(n, idx_rho, t) = ms[t];
+          h_ms_combs(mu, idx_ms_combs, t) = ms[t];
 
         for (int p = 0; p < ndensity; ++p) {
           // real-part only multiplication
-          h_ctildes(n, idx_rho, p) = func->ctildes[ms_ind * ndensity + p];
+          h_ctildes(mu, idx_ms_combs, p) = func->ctildes[ms_ind * ndensity + p];
         }
-        h_offsets(n, idx_rho) = offset;
-        idx_rho++;
+
+        h_idx_funcs(mu, idx_ms_combs) = idx_func_through;
+        idx_ms_combs++;
       }
     }
   }
 
   Kokkos::deep_copy(d_rank, h_rank);
   Kokkos::deep_copy(d_num_ms_combs, h_num_ms_combs);
-  Kokkos::deep_copy(d_offsets, h_offsets);
+  Kokkos::deep_copy(d_idx_funcs, h_idx_funcs);
   Kokkos::deep_copy(d_mus, h_mus);
   Kokkos::deep_copy(d_ns, h_ns);
   Kokkos::deep_copy(d_ls, h_ls);
@@ -443,6 +443,7 @@ void PairPACEKokkos<DeviceType>::init_style()
 
   // spherical harmonics
 
+  MemKK::realloc_kokkos(d_idx_sph, "pace:idx_sph", (lmax + 1) * (lmax + 1));
   MemKK::realloc_kokkos(alm, "pace:alm", (lmax + 1) * (lmax + 1));
   MemKK::realloc_kokkos(blm, "pace:blm", (lmax + 1) * (lmax + 1));
   MemKK::realloc_kokkos(cl, "pace:cl", lmax + 1);
@@ -613,7 +614,7 @@ void PairPACEKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
     Kokkos::deep_copy(weights, 0.0);
     Kokkos::deep_copy(weights_rank1, 0.0);
-    Kokkos::deep_copy(A, 0.0);
+    Kokkos::deep_copy(A_sph, 0.0);
     Kokkos::deep_copy(A_rank1, 0.0);
     Kokkos::deep_copy(rhos, 0.0);
     Kokkos::deep_copy(rho_core, 0.0);
@@ -646,15 +647,6 @@ void PairPACEKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
       Kokkos::parallel_for("ComputeRadial",policy_radial,*this);
     }
 
-    //ComputeYlm
-    {
-      int vector_length = vector_length_default;
-      int team_size = 16;
-      check_team_size_for<TagPairPACEComputeYlm>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
-      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm> policy_ylm(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
-      Kokkos::parallel_for("ComputeYlm",policy_ylm,*this);
-    }
-
     //ComputeAi
     {
       int vector_length = vector_length_default;
@@ -672,7 +664,7 @@ void PairPACEKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
     //ComputeRho
     {
-      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeRho> policy_rho(0,chunk_size*idx_rho_max);
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeRho> policy_rho(0,chunk_size*idx_ms_combs_max);
       Kokkos::parallel_for("ComputeRho",policy_rho,*this);
     }
 
@@ -684,7 +676,7 @@ void PairPACEKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
     //ComputeWeights
     {
-      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeWeights> policy_weights(0,chunk_size*idx_rho_max);
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeWeights> policy_weights(0,chunk_size * idx_ms_combs_max);
       Kokkos::parallel_for("ComputeWeights",policy_weights,*this);
     }
 
@@ -693,7 +685,7 @@ void PairPACEKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
       int vector_length = vector_length_default;
       int team_size = team_size_default;
       check_team_size_for<TagPairPACEComputeDerivative>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
-      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeDerivative> policy_derivative(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType,TagPairPACEComputeDerivative> policy_derivative(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
       Kokkos::parallel_for("ComputeDerivative",policy_derivative,*this);
     }
 
@@ -726,7 +718,6 @@ void PairPACEKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     }
 
     chunk_offset += chunk_size;
-
   } // end while
 
   if (need_dup)
@@ -842,25 +833,24 @@ void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeNeigh,const typen
   });
 
   if (is_zbl) {
-     //adapted from https://www.osti.gov/servlets/purl/1429450
-     if(ncount>0) {
-       using minloc_value_type=Kokkos::MinLoc<F_FLOAT,int>::value_type;
-       minloc_value_type djjmin;
-       djjmin.val=1e20;
-       djjmin.loc=-1;
-       Kokkos::MinLoc<F_FLOAT,int> reducer_scalar(djjmin);
-       // loop over ncount (actual neighbours withing cutoff) rather than jnum (total number of neigh in cutoff+skin)
-       Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, ncount),
+    //adapted from https://www.osti.gov/servlets/purl/1429450
+    if (ncount > 0) {
+      using minloc_value_type=Kokkos::MinLoc<F_FLOAT,int>::value_type;
+      minloc_value_type djjmin;
+      djjmin.val=1e20;
+      djjmin.loc=-1;
+      Kokkos::MinLoc<F_FLOAT,int> reducer_scalar(djjmin);
+      // loop over ncount (actual neighbours within cutoff) rather than jnum (total number of neigh in cutoff+skin)
+      Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, ncount),
                [&](const int offset, minloc_value_type &min_d_dist) {
                  int j = d_nearest(ii,offset);
                  j &= NEIGHMASK;
-                 const int jtype = type(j);
                  auto r = d_rnorms(ii,offset);
                  const int mu_j = d_map(type(j));
                  const F_FLOAT d = r - (d_cut_in(mu_i, mu_j) - d_dcut_in(mu_i, mu_j));
                  if (d < min_d_dist.val) {
-                     min_d_dist.val = d;
-                     min_d_dist.loc = offset;
+                   min_d_dist.val = d;
+                   min_d_dist.loc = offset;
                  }
        }, reducer_scalar);
       d_d_min(ii) = djjmin.val;
@@ -898,28 +888,6 @@ void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeRadial, const typ
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeYlm, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm>::member_type& team) const
-{
-  // Extract the atom number
-  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
-           ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
-
-  // Extract the neighbor number
-  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
-  const int ncount = d_ncount(ii);
-  if (jj >= ncount) return;
-
-  const double xn = d_rhats(ii, jj, 0);
-  const double yn = d_rhats(ii, jj, 1);
-  const double zn = d_rhats(ii, jj, 2);
-  compute_ylm(ii,jj,xn,yn,zn,lmax);
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeAi, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeAi>::member_type& team) const
@@ -941,13 +909,127 @@ void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeAi, const typenam
     Kokkos::atomic_add(&A_rank1(ii, mu_j, n), gr(ii, jj, n) * Y00);
 
   // rank > 1
-  for (int n = 0; n < nradmax; n++) {
-    for (int l = 0; l <= lmax; l++) {
-      for (int m = 0; m <= l; m++) {
-        const int idx = l * (l + 1) + m; // (l, m)
-        Kokkos::atomic_add(&A(ii, mu_j, n, idx).re, fr(ii, jj, n, l) * ylm(ii, jj, idx).re);
-        Kokkos::atomic_add(&A(ii, mu_j, n, idx).im, fr(ii, jj, n, l) * ylm(ii, jj, idx).im);
+
+  // Compute plm and ylm
+
+  // requires rx^2 + ry^2 + rz^2 = 1 , NO CHECKING IS PERFORMED !!!!!!!!!
+  // requires -1 <= rz <= 1 , NO CHECKING IS PERFORMED !!!!!!!!!
+  // prefactors include 1/sqrt(2) factor compared to reference
+
+  complex ylm, phase;
+  complex phasem, mphasem1;
+  complex dyx, dyy, dyz;
+  complex rdy;
+
+  const double rx = d_rhats(ii, jj, 0);
+  const double ry = d_rhats(ii, jj, 1);
+  const double rz = d_rhats(ii, jj, 2);
+
+  phase.re = rx;
+  phase.im = ry;
+
+  double plm_idx,plm_idx1,plm_idx2;
+
+  plm_idx = plm_idx1 = plm_idx2 = 0.0;
+
+  int idx_sph = 0;
+
+  // m = 0
+  for (int l = 0; l <= lmax; l++) {
+    // const int idx = l * (l + 1);
+
+    if (l == 0) {
+      // l=0, m=0
+      // plm[0] = Y00/sq1o4pi; //= sq1o4pi;
+      plm_idx = Y00; //= 1;
+    } else if (l == 1) {
+      // l=1, m=0
+      plm_idx = Y00 * sq3 * rz;
+    } else {
+      // l>=2, m=0
+      plm_idx = alm(idx_sph) * (rz * plm_idx1 + blm(idx_sph) * plm_idx2);
+    }
+
+    ylm.re = plm_idx;
+    ylm.im = 0.0;
+
+    for (int n = 0; n < nradmax; n++) {
+      Kokkos::atomic_add(&A_sph(ii, mu_j, idx_sph, n).re, fr(ii, jj, l, n) * ylm.re);
+      Kokkos::atomic_add(&A_sph(ii, mu_j, idx_sph, n).im, fr(ii, jj, l, n) * ylm.im);
+    }
+
+    plm_idx2 = plm_idx1;
+    plm_idx1 = plm_idx;
+
+    idx_sph++;
+  }
+
+  plm_idx = plm_idx1 = plm_idx2 = 0.0;
+
+  // m = 1
+  for (int l = 1; l <= lmax; l++) {
+    // const int idx = l * (l + 1) + 1; // (l, 1)
+
+    if (l == 1) {
+      // l=1, m=1
+      plm_idx = -sq3o2 * Y00;
+    } else if (l == 2) {
+      const double t = dl(l) * plm_idx1;
+      plm_idx = t * rz;
+    } else {
+      plm_idx = alm(idx_sph) * (rz * plm_idx1 + blm(idx_sph) * plm_idx2);
+    }
+
+    ylm = phase * plm_idx;
+
+    for (int n = 0; n < nradmax; n++) {
+      Kokkos::atomic_add(&A_sph(ii, mu_j, idx_sph, n).re, fr(ii, jj, l, n) * ylm.re);
+      Kokkos::atomic_add(&A_sph(ii, mu_j, idx_sph, n).im, fr(ii, jj, l, n) * ylm.im);
+    }
+
+    plm_idx2 = plm_idx1;
+    plm_idx1 = plm_idx;
+
+    idx_sph++;
+  }
+
+  plm_idx = plm_idx1 = plm_idx2 = 0.0;
+
+  double plm_mm1_mm1 = -sq3o2 * Y00; // (1, 1)
+
+  // m > 1
+  phasem = phase;
+  for (int m = 2; m <= lmax; m++) {
+
+    mphasem1.re = phasem.re * double(m);
+    mphasem1.im = phasem.im * double(m);
+    phasem = phasem * phase;
+
+    for (int l = m; l <= lmax; l++) {
+      // const int idx = l * (l + 1) + m;
+
+      if (l == m) {
+        plm_idx = cl(l) * plm_mm1_mm1; // (m, m) from (m - 1, m - 1)
+        plm_mm1_mm1 = plm_idx;
+      } else if (l == (m + 1)) {
+        const double t = dl(l) * plm_mm1_mm1; // plm_mm1_mm1 now holds (m, m)
+        plm_idx = t * rz; // (m + 1, m)
+      } else {
+        plm_idx = alm(idx_sph) * (rz * plm_idx1 + blm(idx_sph) * plm_idx2);
       }
+
+      ylm.re = phasem.re * plm_idx;
+      ylm.im = phasem.im * plm_idx;
+
+      for (int n = 0; n < nradmax; n++) {
+        Kokkos::atomic_add(&A_sph(ii, mu_j, idx_sph, n).re, fr(ii, jj, l, n) * ylm.re);
+        Kokkos::atomic_add(&A_sph(ii, mu_j, idx_sph, n).im, fr(ii, jj, l, n) * ylm.im);
+      }
+
+      plm_idx2 = plm_idx1;
+      plm_idx1 = plm_idx;
+
+      idx_sph++;
     }
   }
 
@@ -961,17 +1043,35 @@ template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEKokkos<DeviceType>::operator() (TagPairPACEConjugateAi, const int& ii) const
 {
-  //complex conjugate A's (for NEGATIVE (-m) terms)
-  // for rank > 1
   for (int mu_j = 0; mu_j < nelements; mu_j++) {
-    for (int n = 0; n < nradmax; n++) {
-      for (int l = 0; l <= lmax; l++) {
+
+    // transpose
+
+    int idx_sph = 0;
+
+    for (int m = 0; m <= lmax; m++) {
+      for (int l = m; l <= lmax; l++) {
+        const int idx = l * (l + 1) + m;
+        for (int n = 0; n < nradmax; n++) {
+          A(ii, mu_j, idx, n) = A_sph(ii, mu_j, idx_sph, n);
+        }
+
+        idx_sph++;
+      }
+    }
+
+    // complex conjugate A's (for NEGATIVE (-m) terms)
+    //  for rank > 1
+
+    for (int l = 0; l <= lmax; l++) {
         //fill in -m part in the outer loop using the same m <-> -m symmetry as for Ylm
-        for (int m = 1; m <= l; m++) {
-          const int idx = l * (l + 1) + m; // (l, m)
-          const int idxm = l * (l + 1) - m; // (l, -m)
-          const int factor = m % 2 == 0 ? 1 : -1;
-          A(ii, mu_j, n, idxm) = A(ii, mu_j, n, idx).conj() * (double)factor;
+      for (int m = 1; m <= l; m++) {
+        const int idx = l * (l + 1) + m; // (l, m)
+        const int idxm = l * (l + 1) - m; // (l, -m)
+        const int idx_sph = d_idx_sph(idx);
+        const int factor = m % 2 == 0 ? 1 : -1;
+        for (int n = 0; n < nradmax; n++) {
+          A(ii, mu_j, idxm, n) = A_sph(ii, mu_j, idx_sph, n).conj() * (double)factor;
         }
       }
     }
@@ -984,70 +1084,69 @@ template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeRho, const int& iter) const
 {
-  const int idx_rho = iter / chunk_size;
+  const int idx_ms_combs = iter / chunk_size;
   const int ii = iter % chunk_size;
 
   const int i = d_ilist[ii + chunk_offset];
   const int mu_i = d_map(type(i));
 
-  if (idx_rho >= d_idx_rho_count(mu_i)) return;
+  if (idx_ms_combs >= d_idx_ms_combs_count(mu_i)) return;
 
   const int ndensity = d_ndensity(mu_i);
 
-  const int offset = d_offsets(mu_i, idx_rho);
-  const int rank = d_rank(mu_i, offset);
+  const int idx_func = d_idx_funcs(mu_i, idx_ms_combs);
+  const int rank = d_rank(mu_i, idx_func);
   const int r = rank - 1;
 
   // Basis functions B with iterative product and density rho(p) calculation
   if (rank == 1) {
-    const int mu = d_mus(mu_i, offset, 0);
-    const int n = d_ns(mu_i, offset, 0);
+    const int mu = d_mus(mu_i, idx_func, 0);
+    const int n = d_ns(mu_i, idx_func, 0);
     double A_cur = A_rank1(ii, mu, n - 1);
     for (int p = 0; p < ndensity; ++p) {
       //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
-      Kokkos::atomic_add(&rhos(ii, p), d_ctildes(mu_i, idx_rho, p) * A_cur);
+      Kokkos::atomic_add(&rhos(ii, p), d_ctildes(mu_i, idx_ms_combs, p) * A_cur);
     }
   } else { // rank > 1
     // loop over {ms} combinations in sum
 
     // loop over m, collect B  = product of A with given ms
-    A_forward_prod(ii, idx_rho, 0) = complex::one();
+    A_forward_prod(ii, idx_ms_combs, 0) = complex::one();
 
     // fill forward A-product triangle
     for (int t = 0; t < rank; t++) {
       //TODO: optimize ns[t]-1 -> ns[t] during functions construction
-      const int mu = d_mus(mu_i, offset, t);
-      const int n = d_ns(mu_i, offset, t);
-      const int l = d_ls(mu_i, offset, t);
-      const int m = d_ms_combs(mu_i, idx_rho, t); // current ms-combination (of length = rank)
+      const int mu = d_mus(mu_i, idx_func, t);
+      const int n = d_ns(mu_i, idx_func, t);
+      const int l = d_ls(mu_i, idx_func, t);
+      const int m = d_ms_combs(mu_i, idx_ms_combs, t); // current ms-combination (of length = rank)
       const int idx = l * (l + 1) + m; // (l, m)
-      A_list(ii, idx_rho, t) = A(ii, mu, n - 1, idx);
-      A_forward_prod(ii, idx_rho, t + 1) = A_forward_prod(ii, idx_rho, t) * A_list(ii, idx_rho, t);
+      A_list(ii, idx_ms_combs, t) = A(ii, mu, idx, n - 1);
+      A_forward_prod(ii, idx_ms_combs, t + 1) = A_forward_prod(ii, idx_ms_combs, t) * A_list(ii, idx_ms_combs, t);
     }
 
     complex A_backward_prod = complex::one();
 
     // fill backward A-product triangle
     for (int t = r; t >= 1; t--) {
-      const complex dB = A_forward_prod(ii, idx_rho, t) * A_backward_prod; // dB - product of all A's except t-th
-      dB_flatten(ii, idx_rho, t) = dB;
+      const complex dB = A_forward_prod(ii, idx_ms_combs, t) * A_backward_prod; // dB - product of all A's except t-th
+      dB_flatten(ii, idx_ms_combs, t) = dB;
 
-      A_backward_prod = A_backward_prod * A_list(ii, idx_rho, t);
+      A_backward_prod = A_backward_prod * A_list(ii, idx_ms_combs, t);
     }
-    dB_flatten(ii, idx_rho, 0) = A_forward_prod(ii, idx_rho, 0) * A_backward_prod;
+    dB_flatten(ii, idx_ms_combs, 0) = A_forward_prod(ii, idx_ms_combs, 0) * A_backward_prod;
 
-    const complex B = A_forward_prod(ii, idx_rho, rank);
+    const complex B = A_forward_prod(ii, idx_ms_combs, rank);
 
     for (int p = 0; p < ndensity; ++p) {
       // real-part only multiplication
-      Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_ctildes(mu_i, idx_rho, p)));
+      Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_ctildes(mu_i, idx_ms_combs, p)));
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeFS, const int& ii) const
@@ -1064,34 +1163,35 @@ void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeFS, const int& ii
   evdwl = fcut = dfcut = 0.0;
 
   FS_values_and_derivatives(ii, evdwl, mu_i);
+
   if (is_zbl) {
-      if (d_jj_min(ii) != -1) {
-          const int mu_jmin = d_mu(ii,d_jj_min(ii));
-          F_FLOAT dcutin = d_dcut_in(mu_i, mu_jmin);
-          F_FLOAT transition_coordinate =  dcutin  - d_d_min(ii); // == cutin - r_min
-          cutoff_func_poly(transition_coordinate, dcutin, dcutin, fcut, dfcut);
-          dfcut = -dfcut; // invert, because rho_core = cutin - r_min
-      } else {
-          // no neighbours
-          fcut = 1;
-          dfcut = 0;
-      }
-      evdwl_cut = evdwl * fcut + rho_core(ii) * (1 - fcut); // evdwl * fcut + rho_core_uncut  - rho_core_uncut* fcut
-      dF_drho_core(ii) = 1 - fcut;
-      dF_dfcut(ii) = evdwl * dfcut - rho_core(ii) * dfcut;
+    if (d_jj_min(ii) != -1) {
+      const int mu_jmin = d_mu(ii,d_jj_min(ii));
+      F_FLOAT dcutin = d_dcut_in(mu_i, mu_jmin);
+      F_FLOAT transition_coordinate =  dcutin  - d_d_min(ii); // == cutin - r_min
+      cutoff_func_poly(transition_coordinate, dcutin, dcutin, fcut, dfcut);
+      dfcut = -dfcut; // invert, because rho_core = cutin - r_min
+    } else {
+      // no neighbours
+      fcut = 1;
+      dfcut = 0;
+    }
+    evdwl_cut = evdwl * fcut + rho_core(ii) * (1 - fcut); // evdwl * fcut + rho_core_uncut  - rho_core_uncut* fcut
+    dF_drho_core(ii) = 1 - fcut;
+    dF_dfcut(ii) = evdwl * dfcut - rho_core(ii) * dfcut;
   } else {
-      inner_cutoff(rho_core(ii), rho_cut, drho_cut, fcut, dfcut);
-      dF_drho_core(ii) = evdwl * dfcut + 1;
-      evdwl_cut = evdwl * fcut + rho_core(ii);
+    inner_cutoff(rho_core(ii), rho_cut, drho_cut, fcut, dfcut);
+    dF_drho_core(ii) = evdwl * dfcut + 1;
+    evdwl_cut = evdwl * fcut + rho_core(ii);
   }
   for (int p = 0; p < ndensity; ++p)
-     dF_drho(ii, p) *= fcut;
+    dF_drho(ii, p) *= fcut;
 
   // tally energy contribution
   if (eflag) {
-      // E0 shift
-      evdwl_cut += d_E0vals(mu_i);
-      e_atom(ii) = evdwl_cut;
+    // E0 shift
+    evdwl_cut += d_E0vals(mu_i);
+    e_atom(ii) = evdwl_cut;
   }
 
   if (flag_corerep_factor)
@@ -1104,52 +1204,58 @@ template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeWeights, const int& iter) const
 {
-  const int idx_rho = iter / chunk_size;
+  const int idx_ms_combs = iter / chunk_size;
   const int ii = iter % chunk_size;
 
   const int i = d_ilist[ii + chunk_offset];
   const int mu_i = d_map(type(i));
 
-  if (idx_rho >= d_idx_rho_count(mu_i)) return;
+  if (idx_ms_combs >= d_idx_ms_combs_count(mu_i)) return;
 
   const int ndensity = d_ndensity(mu_i);
 
-  const int offset = d_offsets(mu_i, idx_rho);
-  const int rank = d_rank(mu_i, offset);
+  const int idx_func = d_idx_funcs(mu_i, idx_ms_combs);
+  const int rank = d_rank(mu_i, idx_func);
 
   // Weights and theta calculation
 
   if (rank == 1) {
-    const int mu = d_mus(mu_i, offset, 0);
-    const int n = d_ns(mu_i, offset, 0);
+    const int mu = d_mus(mu_i, idx_func, 0);
+    const int n = d_ns(mu_i, idx_func, 0);
     double theta = 0.0;
     for (int p = 0; p < ndensity; ++p) {
       // for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
-      theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p);
+      theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_ms_combs, p);
     }
     Kokkos::atomic_add(&weights_rank1(ii, mu, n - 1), theta);
   } else { // rank > 1
     double theta = 0.0;
     for (int p = 0; p < ndensity; ++p)
-      theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p);
+      theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_ms_combs, p);
 
     theta *= 0.5; // 0.5 factor due to possible double counting ???
     for (int t = 0; t < rank; ++t) {
-      const int m_t = d_ms_combs(mu_i, idx_rho, t);
+      const int m_t = d_ms_combs(mu_i, idx_ms_combs, t);
       const int factor = (m_t % 2 == 0 ? 1 : -1);
-      const complex dB = dB_flatten(ii, idx_rho, t);
-      const int mu_t = d_mus(mu_i, offset, t);
-      const int n_t = d_ns(mu_i, offset, t);
-      const int l_t = d_ls(mu_i, offset, t);
+      const complex dB = dB_flatten(ii, idx_ms_combs, t);
+      const int mu_t = d_mus(mu_i, idx_func, t);
+      const int n_t = d_ns(mu_i, idx_func, t);
+      const int l_t = d_ls(mu_i, idx_func, t);
       const int idx = l_t * (l_t + 1) + m_t; // (l, m)
-      const complex value = theta * dB;
-      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).re), value.re);
-      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).im), value.im);
+      const int idx_sph = d_idx_sph(idx); // compact index of (l, m); -1 where pre_compute_harmonics assigned none
+      if (idx_sph >= 0) { // accumulate only into entries that have a compact slot
+        const complex value = theta * dB;
+        Kokkos::atomic_add(&(weights(ii, mu_t, idx_sph, n_t - 1).re), value.re);
+        Kokkos::atomic_add(&(weights(ii, mu_t, idx_sph, n_t - 1).im), value.im);
+      }
       // update -m_t (that could also be positive), because the basis is half_basis
       const int idxm = l_t * (l_t + 1) - m_t; // (l, -m)
-      const complex valuem = theta * dB.conj() * (double)factor;
-      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).re), valuem.re);
-      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).im), valuem.im);
+      const int idxm_sph = d_idx_sph(idxm); // compact index of (l, -m), or -1 if it has none
+      if (idxm_sph >= 0) { // same guard for the mirrored (l, -m) entry
+        const complex valuem = theta * dB.conj() * (double)factor;
+        Kokkos::atomic_add(&(weights(ii, mu_t, idxm_sph, n_t - 1).re), valuem.re);
+        Kokkos::atomic_add(&(weights(ii, mu_t, idxm_sph, n_t - 1).im), valuem.im);
+      }
     }
   }
 }
@@ -1196,37 +1302,239 @@ void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeDerivative, const
   }
 
   // for rank > 1
-  for (int n = 0; n < nradmax; n++) {
-    for (int l = 0; l <= lmax; l++) {
-      const double R_over_r = fr(ii, jj, n, l) * rinv;
-      const double DR = dfr(ii, jj, n, l);
 
-      // for m >= 0
-      for (int m = 0; m <= l; m++) {
-        const int idx = l * (l + 1) + m; // (l, m)
-        complex w = weights(ii, mu_j, n, idx);
+  // compute plm, dplm, ylm and dylm on the fly, one (l, m) at a time, in local variables
+  // requires rx^2 + ry^2 + rz^2 = 1; NO CHECKING IS PERFORMED!
+  // requires -1 <= rz <= 1; NO CHECKING IS PERFORMED!
+  // prefactors include a 1/sqrt(2) factor compared to the reference
+
+  complex ylm,dylm[3];
+  complex phase;
+  complex phasem, mphasem1;
+  complex dyx, dyy, dyz;
+  complex rdy;
+
+  const double rx = d_rhats(ii, jj, 0);
+  const double ry = d_rhats(ii, jj, 1);
+  const double rz = d_rhats(ii, jj, 2);
+
+  phase.re = rx;
+  phase.im = ry;
+
+  double plm_idx,plm_idx1,plm_idx2;
+  double dplm_idx,dplm_idx1,dplm_idx2;
+
+  plm_idx = plm_idx1 = plm_idx2 = 0.0;
+  dplm_idx = dplm_idx1 = dplm_idx2 = 0.0;
+
+  int idx_sph = 0;
+
+  // m = 0
+  for (int l = 0; l <= lmax; l++) {
+    // const int idx = l * (l + 1);
+
+    if (l == 0) {
+      // l=0, m=0
+      // plm[0] = Y00/sq1o4pi; //= sq1o4pi;
+      plm_idx = Y00; //= 1;
+      dplm_idx = 0.0;
+    } else if (l == 1) {
+      // l=1, m=0
+      plm_idx = Y00 * sq3 * rz;
+      dplm_idx = Y00 * sq3;
+    } else {
+      // l>=2, m=0
+      plm_idx = alm(idx_sph) * (rz * plm_idx1 + blm(idx_sph) * plm_idx2);
+      dplm_idx = alm(idx_sph) * (plm_idx1 + rz * dplm_idx1 + blm(idx_sph) * dplm_idx2);
+    }
+
+    ylm.re = plm_idx;
+    ylm.im = 0.0;
+
+    dyz.re = dplm_idx;
+    rdy.re = dyz.re * rz;
+
+    dylm[0].re = -rdy.re * rx;
+    dylm[0].im = 0.0;
+    dylm[1].re = -rdy.re * ry;
+    dylm[1].im = 0.0;
+    dylm[2].re = dyz.re - rdy.re * rz;
+    dylm[2].im = 0;
+
+    for (int n = 0; n < nradmax; n++) {
+
+      const double R_over_r = fr(ii, jj, l, n) * rinv;
+      const double DR = dfr(ii, jj, l, n);
+      const complex Y_DR = ylm * DR;
+
+      complex w = weights(ii, mu_j, idx_sph, n);
+      if (w.re == 0.0 && w.im == 0.0) continue;
+
+      complex grad_phi_nlm[3];
+      grad_phi_nlm[0] = Y_DR * r_hat[0] + dylm[0] * R_over_r;
+      grad_phi_nlm[1] = Y_DR * r_hat[1] + dylm[1] * R_over_r;
+      grad_phi_nlm[2] = Y_DR * r_hat[2] + dylm[2] * R_over_r;
+      // real-part multiplication only
+      f_ji[0] += w.real_part_product(grad_phi_nlm[0]);
+      f_ji[1] += w.real_part_product(grad_phi_nlm[1]);
+      f_ji[2] += w.real_part_product(grad_phi_nlm[2]);
+    }
+
+    plm_idx2 = plm_idx1;
+    dplm_idx2 = dplm_idx1;
+
+    plm_idx1 = plm_idx;
+    dplm_idx1 = dplm_idx;
+
+    idx_sph++;
+  }
+
+  plm_idx = plm_idx1 = plm_idx2 = 0.0;
+  dplm_idx = dplm_idx1 = dplm_idx2 = 0.0;
+
+  // m = 1
+  for (int l = 1; l <= lmax; l++) {
+    // const int idx = l * (l + 1) + 1; // (l, 1)
+
+    if (l == 1) {
+      // l=1, m=1
+      plm_idx = -sq3o2 * Y00;
+      dplm_idx = 0.0;
+    } else if (l == 2) {
+      const double t = dl(l) * plm_idx1;
+      plm_idx = t * rz;
+      dplm_idx = t;
+    } else {
+      plm_idx = alm(idx_sph) * (rz * plm_idx1 + blm(idx_sph) * plm_idx2);
+      dplm_idx = alm(idx_sph) * (plm_idx1 + rz * dplm_idx1 + blm(idx_sph) * dplm_idx2);
+    }
+
+    ylm = phase * plm_idx; // Y(l, 1) = (rx + i*ry) * plm(l, 1)
+
+    dyx.re = plm_idx;
+    dyx.im = 0.0;
+    dyy.re = 0.0;
+    dyy.im = plm_idx;
+    dyz.re = phase.re * dplm_idx;
+    dyz.im = phase.im * dplm_idx;
+
+    rdy.re = rx * dyx.re + rz * dyz.re; // no ry term: dyy.re is zero
+    rdy.im = ry * dyy.im + rz * dyz.im; // no rx term: dyx.im is zero
+
+    dylm[0].re = dyx.re - rdy.re * rx;
+    dylm[0].im = -rdy.im * rx; // dyx.im is zero
+    dylm[1].re = -rdy.re * ry; // dyy.re is zero
+    dylm[1].im = dyy.im - rdy.im * ry;
+    dylm[2].re = dyz.re - rdy.re * rz;
+    dylm[2].im = dyz.im - rdy.im * rz;
+
+    for (int n = 0; n < nradmax; n++) {
+
+      const double R_over_r = fr(ii, jj, l, n) * rinv;
+      const double DR = dfr(ii, jj, l, n);
+      const complex Y_DR = ylm * DR;
+
+      complex w = weights(ii, mu_j, idx_sph, n);
+      if (w.re == 0.0 && w.im == 0.0) continue;
+      // counting for -m cases if m > 0
+      w.re *= 2.0;
+      w.im *= 2.0;
+
+      complex grad_phi_nlm[3];
+      grad_phi_nlm[0] = Y_DR * r_hat[0] + dylm[0] * R_over_r;
+      grad_phi_nlm[1] = Y_DR * r_hat[1] + dylm[1] * R_over_r;
+      grad_phi_nlm[2] = Y_DR * r_hat[2] + dylm[2] * R_over_r;
+      // real-part multiplication only
+      f_ji[0] += w.real_part_product(grad_phi_nlm[0]);
+      f_ji[1] += w.real_part_product(grad_phi_nlm[1]);
+      f_ji[2] += w.real_part_product(grad_phi_nlm[2]);
+    }
+
+    plm_idx2 = plm_idx1;
+    dplm_idx2 = dplm_idx1;
+
+    plm_idx1 = plm_idx;
+    dplm_idx1 = dplm_idx;
+
+    idx_sph++;
+  }
+
+  plm_idx = plm_idx1 = plm_idx2 = 0.0;
+  dplm_idx = dplm_idx1 = dplm_idx2 = 0.0;
+
+  double plm_mm1_mm1 = -sq3o2 * Y00; // (1, 1)
+
+  // m > 1
+  phasem = phase;
+  for (int m = 2; m <= lmax; m++) {
+
+    mphasem1.re = phasem.re * double(m);
+    mphasem1.im = phasem.im * double(m);
+    phasem = phasem * phase;
+
+    for (int l = m; l <= lmax; l++) {
+      // const int idx = l * (l + 1) + m;
+
+      if (l == m) {
+        plm_idx = cl(l) * plm_mm1_mm1; // (m, m) computed from (m - 1, m - 1)
+        dplm_idx = 0.0;
+        plm_mm1_mm1 = plm_idx; // now holds (m, m): used by the l == m + 1 step and by the next m
+      } else if (l == (m + 1)) {
+        const double t = dl(l) * plm_mm1_mm1; // plm_mm1_mm1 is (m, m) here, i.e. (l - 1, l - 1)
+        plm_idx = t * rz; // (m + 1, m)
+        dplm_idx = t;
+      } else {
+        plm_idx = alm(idx_sph) * (rz * plm_idx1 + blm(idx_sph) * plm_idx2); // from (l - 1, m) and (l - 2, m)
+        dplm_idx = alm(idx_sph) * (plm_idx1 + rz * dplm_idx1 + blm(idx_sph) * dplm_idx2);
+      }
+
+      ylm.re = phasem.re * plm_idx;
+      ylm.im = phasem.im * plm_idx;
+
+      dyx = mphasem1 * plm_idx;
+      dyy.re = -dyx.im;
+      dyy.im = dyx.re;
+      dyz = phasem * dplm_idx;
+
+      rdy.re = rx * dyx.re + ry * dyy.re + rz * dyz.re;
+      rdy.im = rx * dyx.im + ry * dyy.im + rz * dyz.im;
+
+      dylm[0].re = dyx.re - rdy.re * rx;
+      dylm[0].im = dyx.im - rdy.im * rx;
+      dylm[1].re = dyy.re - rdy.re * ry;
+      dylm[1].im = dyy.im - rdy.im * ry;
+      dylm[2].re = dyz.re - rdy.re * rz;
+      dylm[2].im = dyz.im - rdy.im * rz;
+
+      for (int n = 0; n < nradmax; n++) {
+
+        const double R_over_r = fr(ii, jj, l, n) * rinv;
+        const double DR = dfr(ii, jj, l, n);
+        const complex Y_DR = ylm * DR;
+
+        complex w = weights(ii, mu_j, idx_sph, n);
         if (w.re == 0.0 && w.im == 0.0) continue;
         // counting for -m cases if m > 0
-        if (m > 0) {
-          w.re *= 2.0;
-          w.im *= 2.0;
-        }
-
-        complex DY[3];
-        DY[0] = dylm(ii, jj, idx, 0);
-        DY[1] = dylm(ii, jj, idx, 1);
-        DY[2] = dylm(ii, jj, idx, 2);
-        const complex Y_DR = ylm(ii, jj, idx) * DR;
+        w.re *= 2.0;
+        w.im *= 2.0;
 
         complex grad_phi_nlm[3];
-        grad_phi_nlm[0] = Y_DR * r_hat[0] + DY[0] * R_over_r;
-        grad_phi_nlm[1] = Y_DR * r_hat[1] + DY[1] * R_over_r;
-        grad_phi_nlm[2] = Y_DR * r_hat[2] + DY[2] * R_over_r;
+        grad_phi_nlm[0] = Y_DR * r_hat[0] + dylm[0] * R_over_r;
+        grad_phi_nlm[1] = Y_DR * r_hat[1] + dylm[1] * R_over_r;
+        grad_phi_nlm[2] = Y_DR * r_hat[2] + dylm[2] * R_over_r;
         // real-part multiplication only
         f_ji[0] += w.real_part_product(grad_phi_nlm[0]);
         f_ji[1] += w.real_part_product(grad_phi_nlm[1]);
         f_ji[2] += w.real_part_product(grad_phi_nlm[2]);
       }
+
+      plm_idx2 = plm_idx1;
+      dplm_idx2 = dplm_idx1;
+
+      plm_idx1 = plm_idx;
+      dplm_idx1 = dplm_idx;
+
+      idx_sph++;
     }
   }
 
@@ -1238,10 +1546,10 @@ void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeDerivative, const
 
   if (is_zbl) {
     if (jj==d_jj_min(ii)) {
-        // DCRU = 1.0
-        f_ij(ii, jj, 0) += dF_dfcut(ii) * r_hat[0];
-        f_ij(ii, jj, 1) += dF_dfcut(ii) * r_hat[1];
-        f_ij(ii, jj, 2) += dF_dfcut(ii) * r_hat[2];
+      // DCRU = 1.0
+      f_ij(ii, jj, 0) += dF_dfcut(ii) * r_hat[0];
+      f_ij(ii, jj, 1) += dF_dfcut(ii) * r_hat[1];
+      f_ij(ii, jj, 2) += dF_dfcut(ii) * r_hat[2];
     }
   }
 }
@@ -1364,31 +1672,46 @@ void PairPACEKokkos<DeviceType>::v_tally_xyz(EV_FLOAT &ev, const int &i, const i
 template<class DeviceType>
 void PairPACEKokkos<DeviceType>::pre_compute_harmonics(int lmax)
 {
+  auto h_idx_sph = Kokkos::create_mirror_view(d_idx_sph);
   auto h_alm = Kokkos::create_mirror_view(alm);
   auto h_blm = Kokkos::create_mirror_view(blm);
   auto h_cl = Kokkos::create_mirror_view(cl);
   auto h_dl = Kokkos::create_mirror_view(dl);
 
-  for (int l = 1; l <= lmax; l++) {
-    const double lsq = l * l;
-    const double ld = 2 * l;
-    const double l1 = (4 * lsq - 1);
-    const double l2 = lsq - ld + 1;
-    for (int m = 0; m < l - 1; m++) {
-      const double msq = m * m;
-      const double a = sqrt((double(l1)) / (double(lsq - msq)));
-      const double b = -sqrt((double(l2 - msq)) / (double(4 * l2 - 1)));
+  Kokkos::deep_copy(h_idx_sph,-1); // -1 = no compact index; stays so for every entry not visited below
+
+  int idx_sph = 0; // running compact index, assigned in m-major order (all l >= m for m = 0, then m = 1, ...)
+  for (int m = 0; m <= lmax; m++) {
+    const double msq = m * m;
+    for (int l = m; l <= lmax; l++) {
       const int idx = l * (l + 1) + m; // (l, m)
-      h_alm(idx) = a;
-      h_blm(idx) = b;
+      h_idx_sph(idx) = idx_sph; // dense (l, m) index -> compact index
+
+      double a = 0.0;
+      double b = 0.0;
+
+      if (l > 1 && l != m) { // a and b stay zero for l <= 1 and on the diagonal l == m
+        const double lsq = l * l;
+        const double ld = 2 * l;
+        const double l1 = (4 * lsq - 1);
+        const double l2 = lsq - ld + 1; // == (l - 1)^2
+
+        a = sqrt((double(l1)) / (double(lsq - msq)));
+        b = -sqrt((double(l2 - msq)) / (double(4 * l2 - 1)));
+      }
+      h_alm(idx_sph) = a; // alm/blm are stored by compact index, not by dense (l, m) index
+      h_blm(idx_sph) = b;
+      idx_sph++;
     }
   }
+  idx_sph_max = idx_sph; // number of compact entries, i.e. of (l, m) pairs with 0 <= m <= l <= lmax
 
   for (int l = 1; l <= lmax; l++) {
     h_cl(l) = -sqrt(1.0 + 0.5 / (double(l)));
     h_dl(l) = sqrt(double(2 * (l - 1) + 3));
   }
 
+  Kokkos::deep_copy(d_idx_sph, h_idx_sph);
   Kokkos::deep_copy(alm, h_alm);
   Kokkos::deep_copy(blm, h_blm);
   Kokkos::deep_copy(cl, h_cl);
@@ -1397,143 +1720,6 @@ void PairPACEKokkos<DeviceType>::pre_compute_harmonics(int lmax)
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairPACEKokkos<DeviceType>::compute_barplm(int ii, int jj, double rz, int lmax) const
-{
-  // requires -1 <= rz <= 1 , NO CHECKING IS PERFORMED !!!!!!!!!
-  // prefactors include 1/sqrt(2) factor compared to reference
-
-  // l=0, m=0
-  // plm(ii, jj, 0, 0) = Y00/sq1o4pi; //= sq1o4pi;
-  plm(ii, jj, 0) = Y00; //= 1;
-  dplm(ii, jj, 0) = 0.0;
-
-  if (lmax > 0) {
-
-    // l=1, m=0
-    plm(ii, jj, 2) = Y00 * sq3 * rz;
-    dplm(ii, jj, 2) = Y00 * sq3;
-
-    // l=1, m=1
-    plm(ii, jj, 3) = -sq3o2 * Y00;
-    dplm(ii, jj, 3) = 0.0;
-
-    // loop l = 2, lmax
-    for (int l = 2; l <= lmax; l++) {
-      for (int m = 0; m < l - 1; m++) {
-        const int idx = l * (l + 1) + m; // (l, m)
-        const int idx1 = (l - 1) * l + m; // (l - 1, m)
-        const int idx2 = (l - 2) * (l - 1) + m; // (l - 2, m)
-        plm(ii, jj, idx) = alm(idx) * (rz * plm(ii, jj, idx1) + blm(idx) * plm(ii, jj, idx2));
-        dplm(ii, jj, idx) = alm(idx) * (plm(ii, jj, idx1) + rz * dplm(ii, jj, idx1) + blm(idx) * dplm(ii, jj, idx2));
-      }
-      const int idx = l * (l + 1) + l; // (l, l)
-      const int idx1 = l * (l + 1) + l - 1; // (l, l - 1)
-      const int idx2 = (l - 1) * l + l - 1; // (l - 1, l - 1)
-      const double t = dl(l) * plm(ii, jj, idx2);
-      plm(ii, jj, idx1) = t * rz;
-      dplm(ii, jj, idx1) = t;
-      plm(ii, jj, idx) = cl(l) * plm(ii, jj, idx2);
-      dplm(ii, jj, idx) = 0.0;
-    }
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairPACEKokkos<DeviceType>::compute_ylm(int ii, int jj, double rx, double ry, double rz, int lmax) const
-{
-  // requires rx^2 + ry^2 + rz^2 = 1 , NO CHECKING IS PERFORMED !!!!!!!!!
-
-  complex phase;
-  complex phasem, mphasem1;
-  complex dyx, dyy, dyz;
-  complex rdy;
-
-  phase.re = rx;
-  phase.im = ry;
-
-  // compute barplm
-  compute_barplm(ii, jj, rz, lmax);
-
-  // m = 0
-  for (int l = 0; l <= lmax; l++) {
-    const int idx = l * (l + 1);
-
-    ylm(ii, jj, idx).re = plm(ii, jj, idx);
-    ylm(ii, jj, idx).im = 0.0;
-
-    dyz.re = dplm(ii, jj, idx);
-    rdy.re = dyz.re * rz;
-
-    dylm(ii, jj, idx, 0).re = -rdy.re * rx;
-    dylm(ii, jj, idx, 0).im = 0.0;
-    dylm(ii, jj, idx, 1).re = -rdy.re * ry;
-    dylm(ii, jj, idx, 1).im = 0.0;
-    dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
-    dylm(ii, jj, idx, 2).im = 0;
-  }
-  // m = 1
-  for (int l = 1; l <= lmax; l++) {
-    const int idx = l * (l + 1) + 1;
-
-    ylm(ii, jj, idx) = phase * plm(ii, jj, idx);
-
-    dyx.re = plm(ii, jj, idx);
-    dyx.im = 0.0;
-    dyy.re = 0.0;
-    dyy.im = plm(ii, jj, idx);
-    dyz.re = phase.re * dplm(ii, jj, idx);
-    dyz.im = phase.im * dplm(ii, jj, idx);
-
-    rdy.re = rx * dyx.re + +rz * dyz.re;
-    rdy.im = ry * dyy.im + rz * dyz.im;
-
-    dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx;
-    dylm(ii, jj, idx, 0).im = -rdy.im * rx;
-    dylm(ii, jj, idx, 1).re = -rdy.re * ry;
-    dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry;
-    dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
-    dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz;
-  }
-
-  // m > 1
-  phasem = phase;
-  for (int m = 2; m <= lmax; m++) {
-
-    mphasem1.re = phasem.re * double(m);
-    mphasem1.im = phasem.im * double(m);
-    phasem = phasem * phase;
-
-    for (int l = m; l <= lmax; l++) {
-      const int idx = l * (l + 1) + m;
-
-      ylm(ii, jj, idx).re = phasem.re * plm(ii, jj, idx);
-      ylm(ii, jj, idx).im = phasem.im * plm(ii, jj, idx);
-
-      dyx = mphasem1 * plm(ii, jj, idx);
-      dyy.re = -dyx.im;
-      dyy.im = dyx.re;
-      dyz = phasem * dplm(ii, jj, idx);
-
-      rdy.re = rx * dyx.re + ry * dyy.re + rz * dyz.re;
-      rdy.im = rx * dyx.im + ry * dyy.im + rz * dyz.im;
-
-      dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx;
-      dylm(ii, jj, idx, 0).im = dyx.im - rdy.im * rx;
-      dylm(ii, jj, idx, 1).re = dyy.re - rdy.re * ry;
-      dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry;
-      dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
-      dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz;
-    }
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEKokkos<DeviceType>::cutoff_func_poly(const double r, const double r_in, const double delta_in, double &fc, double &dfc) const
@@ -1662,11 +1848,11 @@ void PairPACEKokkos<DeviceType>::evaluate_splines(const int ii, const int jj, do
   spline_gk.calcSplines(ii, jj, r, gr, dgr);
 
   spline_rnl.calcSplines(ii, jj, r, d_values, d_derivatives);
-  for (int kk = 0; kk < (int)fr.extent(2); kk++) {
-    for (int ll = 0; ll < (int)fr.extent(3); ll++) {
-      const int flatten = kk*fr.extent(3) + ll;
-      fr(ii, jj, kk, ll) = d_values(ii, jj, flatten);
-      dfr(ii, jj, kk, ll) = d_derivatives(ii, jj, flatten);
+  for (int ll = 0; ll < (int)fr.extent(2); ll++) { // ll runs over dimension 2 of fr/dfr (read elsewhere as fr(ii, jj, l, n))
+    for (int kk = 0; kk < (int)fr.extent(3); kk++) { // kk runs over dimension 3 of fr/dfr
+      const int flatten = kk*fr.extent(2) + ll; // flat spline index with ll fastest: kk * (number of ll values) + ll
+      fr(ii, jj, ll, kk) = d_values(ii, jj, flatten);
+      dfr(ii, jj, ll, kk) = d_derivatives(ii, jj, flatten);
     }
   }
 
@@ -1686,7 +1872,7 @@ void PairPACEKokkos<DeviceType>::SplineInterpolatorKokkos::operator=(const Splin
     rscalelookup = spline.rscalelookup;
     num_of_functions = spline.num_of_functions;
 
-    lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions);
+    lookupTable = t_ace_3d4_lr("lookupTable", ntot+1, num_of_functions);
     auto h_lookupTable = Kokkos::create_mirror_view(lookupTable);
     for (int i = 0; i < ntot+1; i++)
         for (int j = 0; j < num_of_functions; j++)
@@ -1792,10 +1978,6 @@ double PairPACEKokkos<DeviceType>::memory_usage()
   bytes += MemKK::memory_usage(d_derivatives);
   bytes += MemKK::memory_usage(cr);
   bytes += MemKK::memory_usage(dcr);
-  bytes += MemKK::memory_usage(plm);
-  bytes += MemKK::memory_usage(dplm);
-  bytes += MemKK::memory_usage(ylm);
-  bytes += MemKK::memory_usage(dylm);
   bytes += MemKK::memory_usage(d_ncount);
   bytes += MemKK::memory_usage(d_mu);
   bytes += MemKK::memory_usage(d_rhats);
@@ -1811,10 +1993,10 @@ double PairPACEKokkos<DeviceType>::memory_usage()
   bytes += MemKK::memory_usage(d_npoti);
   bytes += MemKK::memory_usage(d_wpre);
   bytes += MemKK::memory_usage(d_mexp);
-  bytes += MemKK::memory_usage(d_idx_rho_count);
+  bytes += MemKK::memory_usage(d_idx_ms_combs_count);
   bytes += MemKK::memory_usage(d_rank);
   bytes += MemKK::memory_usage(d_num_ms_combs);
-  bytes += MemKK::memory_usage(d_offsets);
+  bytes += MemKK::memory_usage(d_idx_funcs);
   bytes += MemKK::memory_usage(d_mus);
   bytes += MemKK::memory_usage(d_ns);
   bytes += MemKK::memory_usage(d_ls);
diff --git a/src/KOKKOS/pair_pace_kokkos.h b/src/KOKKOS/pair_pace_kokkos.h
index 36486f8628..e22c61f0ea 100644
--- a/src/KOKKOS/pair_pace_kokkos.h
+++ b/src/KOKKOS/pair_pace_kokkos.h
@@ -36,7 +36,6 @@ class PairPACEKokkos : public PairPACE {
  public:
   struct TagPairPACEComputeNeigh{};
   struct TagPairPACEComputeRadial{};
-  struct TagPairPACEComputeYlm{};
   struct TagPairPACEComputeAi{};
   struct TagPairPACEConjugateAi{};
   struct TagPairPACEComputeRho{};
@@ -66,9 +65,6 @@ class PairPACEKokkos : public PairPACE {
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairPACEComputeRadial,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeRadial>::member_type& team) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairPACEComputeYlm,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm>::member_type& team) const;
-
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairPACEComputeAi,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeAi>::member_type& team) const;
 
@@ -96,7 +92,7 @@ class PairPACEKokkos : public PairPACE {
   void operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>,const int& ii, EV_FLOAT&) const;
 
  protected:
-  int inum, maxneigh, chunk_size, chunk_offset, idx_rho_max;
+  int inum, maxneigh, chunk_size, chunk_offset, idx_ms_combs_max, idx_sph_max;
   int host_flag;
 
   int eflag, vflag;
@@ -157,12 +153,6 @@ class PairPACEKokkos : public PairPACE {
       const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz,
       const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void compute_barplm(int, int, double, int) const;
-
-  KOKKOS_INLINE_FUNCTION
-  void compute_ylm(int, int, double, double, double, int) const;
-
   KOKKOS_INLINE_FUNCTION
   void cutoff_func_poly(const double, const double, const double, double &, double &) const;
 
@@ -194,14 +184,18 @@ class PairPACEKokkos : public PairPACE {
 
   typedef Kokkos::View<int*, DeviceType> t_ace_1i;
   typedef Kokkos::View<int**, DeviceType> t_ace_2i;
+  typedef Kokkos::View<int**, Kokkos::LayoutRight, DeviceType> t_ace_2i_lr;
   typedef Kokkos::View<int***, DeviceType> t_ace_3i;
+  typedef Kokkos::View<int***, Kokkos::LayoutRight, DeviceType> t_ace_3i_lr;
   typedef Kokkos::View<int****, DeviceType> t_ace_4i;
   typedef Kokkos::View<double*, DeviceType> t_ace_1d;
   typedef Kokkos::View<double**, DeviceType> t_ace_2d;
+  typedef Kokkos::View<double**, Kokkos::LayoutRight, DeviceType> t_ace_2d_lr;
   typedef Kokkos::View<double*[3], DeviceType> t_ace_2d3;
   typedef Kokkos::View<double***, DeviceType> t_ace_3d;
   typedef Kokkos::View<double**[3], DeviceType> t_ace_3d3;
   typedef Kokkos::View<double**[4], DeviceType> t_ace_3d4;
+  typedef Kokkos::View<double**[4], Kokkos::LayoutRight, DeviceType> t_ace_3d4_lr;
   typedef Kokkos::View<double****, DeviceType> t_ace_4d;
   typedef Kokkos::View<complex*, DeviceType> t_ace_1c;
   typedef Kokkos::View<complex**, DeviceType> t_ace_2c;
@@ -248,23 +242,13 @@ class PairPACEKokkos : public PairPACE {
 
   void pre_compute_harmonics(int);
 
-  KOKKOS_INLINE_FUNCTION
-  void compute_barplm(double rz, int lmaxi);
-
-  KOKKOS_INLINE_FUNCTION
-  void compute_ylm(double rx, double ry, double rz, int lmaxi);
-
+  t_ace_4c A_sph;
+  t_ace_1d d_idx_sph; // dense (l, m) index -> compact index, -1 if none; NOTE(review): values are read into int but t_ace_1d stores double - confirm whether t_ace_1i was intended
   t_ace_1d alm;
   t_ace_1d blm;
   t_ace_1d cl;
   t_ace_1d dl;
 
-  t_ace_3d plm;
-  t_ace_3d dplm;
-
-  t_ace_3c ylm;
-  t_ace_4c3 dylm;
-
   // short neigh list
   t_ace_1i d_ncount;
   t_ace_2d d_mu;
@@ -283,18 +267,18 @@ class PairPACEKokkos : public PairPACE {
   t_ace_1d d_rho_core_cutoff;
   t_ace_1d d_drho_core_cutoff;
   t_ace_1d d_E0vals;
-  t_ace_2d d_wpre;
-  t_ace_2d d_mexp;
+  t_ace_2d_lr d_wpre;
+  t_ace_2d_lr d_mexp;
 
   // tilde
-  t_ace_1i d_idx_rho_count;
-  t_ace_2i d_rank;
-  t_ace_2i d_num_ms_combs;
-  t_ace_2i d_offsets;
-  t_ace_3i d_mus;
-  t_ace_3i d_ns;
-  t_ace_3i d_ls;
-  t_ace_3i d_ms_combs;
+  t_ace_1i d_idx_ms_combs_count;
+  t_ace_2i_lr d_rank;
+  t_ace_2i_lr d_num_ms_combs;
+  t_ace_2i_lr d_idx_funcs;
+  t_ace_3i_lr d_mus;
+  t_ace_3i_lr d_ns;
+  t_ace_3i_lr d_ls;
+  t_ace_3i_lr d_ms_combs;
   t_ace_3d d_ctildes;
 
   t_ace_3d3 f_ij;
@@ -304,12 +288,12 @@ class PairPACEKokkos : public PairPACE {
     int ntot, nlut, num_of_functions;
     double cutoff, deltaSplineBins, invrscalelookup, rscalelookup;
 
-    t_ace_3d4 lookupTable;
+    t_ace_3d4_lr lookupTable;
 
     void operator=(const SplineInterpolator &spline);
 
     void deallocate() {
-      lookupTable = t_ace_3d4();
+      lookupTable = t_ace_3d4_lr();
     }
 
     double memory_usage() {
diff --git a/src/KOKKOS/pair_reaxff_kokkos.cpp b/src/KOKKOS/pair_reaxff_kokkos.cpp
index 11a40970c2..505681acb3 100644
--- a/src/KOKKOS/pair_reaxff_kokkos.cpp
+++ b/src/KOKKOS/pair_reaxff_kokkos.cpp
@@ -1598,7 +1598,6 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfBlocking<
   F_FLOAT dDeltap_self_i[3] = {0.0,0.0,0.0};
   F_FLOAT total_bo_i = 0.0;
 
-  int j_index,i_index;
   d_bo_first[i] = i*maxbo;
   const int bo_first_i = d_bo_first[i];
 
@@ -1675,7 +1674,7 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfBlocking<
 
       int ii_index = -1;
       int jj_index = -1;
-      if (build_bo_list<NEIGHFLAG>(bo_first_i, i, j, i_index, j_index, ii_index, jj_index)) {
+      if (build_bo_list<NEIGHFLAG>(bo_first_i, i, j, ii_index, jj_index)) {
 
         // from BondOrder1
 
@@ -1743,7 +1742,6 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfBlockingP
 
   F_FLOAT C12, C34, C56, BO_s, BO_pi, BO_pi2, BO, delij[3];
 
-  int j_index,i_index;
   d_bo_first[i] = i*maxbo;
   const int bo_first_i = d_bo_first[i];
 
@@ -1821,7 +1819,7 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfBlockingP
 
       int ii_index = -1;
       int jj_index = -1;
-      build_bo_list<NEIGHFLAG>(bo_first_i, i, j, i_index, j_index, ii_index, jj_index);
+      build_bo_list<NEIGHFLAG>(bo_first_i, i, j, ii_index, jj_index);
     }
   }
 }
@@ -1842,7 +1840,6 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfPreview<N
 
   F_FLOAT C12, C34, C56, BO_s, BO_pi, BO_pi2, BO, delij[3];
 
-  int j_index,i_index;
   d_bo_first[i] = i*maxbo;
   const int bo_first_i = d_bo_first[i];
 
@@ -1891,7 +1888,7 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfPreview<N
     int ii_index = -1;
     int jj_index = -1;
 
-    build_bo_list<NEIGHFLAG>(bo_first_i, i, j, i_index, j_index, ii_index, jj_index);
+    build_bo_list<NEIGHFLAG>(bo_first_i, i, j, ii_index, jj_index);
   }
 }
 
@@ -1942,7 +1939,8 @@ void PairReaxFFKokkos<DeviceType>::build_hb_list(F_FLOAT rsq, int i, int hb_firs
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
-bool PairReaxFFKokkos<DeviceType>::build_bo_list(int bo_first_i, int i, int j, int i_index, int j_index, int& ii_index, int& jj_index) const {
+bool PairReaxFFKokkos<DeviceType>::build_bo_list(int bo_first_i, int i, int j, int& ii_index, int& jj_index) const {
+  int i_index, j_index; // local scratch indices (formerly by-value parameters)
 
   if (NEIGHFLAG == HALF) {
     j_index = bo_first_i + d_bo_num[i];
@@ -2509,8 +2507,6 @@ void PairReaxFFKokkos<DeviceType>::compute_angular_sbo(int i, int itype, int j_s
   F_FLOAT prod_SBO = 1.0;
 
   for (int jj = j_start; jj < j_end; jj++) {
-    int j = d_bo_list[jj];
-    j &= NEIGHMASK;
     const int j_index = jj - j_start;
     const F_FLOAT bo_ij = d_BO(i,j_index);
 
@@ -2919,8 +2915,6 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxComputeAngularPreproces
   a_CdDelta[k] += CEcoa5;
 
   for (int ll = j_start; ll < j_end; ll++) {
-    int l = d_bo_list[ll];
-    l &= NEIGHMASK;
     const int l_index = ll - j_start;
 
     temp_bo_jt = d_BO(i,l_index);
diff --git a/src/KOKKOS/pair_reaxff_kokkos.h b/src/KOKKOS/pair_reaxff_kokkos.h
index fba7c03ec4..5f228ebd19 100644
--- a/src/KOKKOS/pair_reaxff_kokkos.h
+++ b/src/KOKKOS/pair_reaxff_kokkos.h
@@ -185,7 +185,7 @@ class PairReaxFFKokkos : public PairReaxFF {
   // Returns if we need to populate d_d* functions or not
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
-  bool build_bo_list(int, int, int, int, int, int&, int&) const;
+  bool build_bo_list(int, int, int, int&, int&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairReaxBuildListsFull, const int&) const;
@@ -526,7 +526,7 @@ struct PairReaxKokkosFindBondFunctor  {
   typedef int value_type;
   int groupbit;
   PairReaxFFKokkos<DeviceType> c;
-  PairReaxKokkosFindBondFunctor(PairReaxFFKokkos<DeviceType>* c_ptr, int groupbit):c(*c_ptr),groupbit(groupbit) {};
+  PairReaxKokkosFindBondFunctor(PairReaxFFKokkos<DeviceType>* c_ptr, int groupbit):groupbit(groupbit),c(*c_ptr) {} // initializers follow member declaration order (groupbit, then c)
 
   KOKKOS_INLINE_FUNCTION
   void join(int &dst,
diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp
index 2a53682df3..36d5974c6d 100644
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@@ -1371,8 +1371,6 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_brick2fft, const int &ii) const
 template<class DeviceType>
 void PPPMKokkos<DeviceType>::poisson_ik()
 {
-  int j;
-
   // transform charge density (r -> k)
 
   copymode = 1;
@@ -1383,7 +1381,8 @@ void PPPMKokkos<DeviceType>::poisson_ik()
 
   // global energy and virial contribution
 
-  scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
+  bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; // cast the first factor so the whole product is evaluated in bigint (presumably the grid dims are int - confirm)
+  scaleinv = 1.0/ngridtotal;
   s2 = scaleinv*scaleinv;
 
   if (eflag_global || vflag_global) {
@@ -1392,7 +1391,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
       copymode = 1;
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik2>(0,nfft),*this,ev);
       copymode = 0;
-      for (j = 0; j < 6; j++) virial[j] += ev.v[j];
+      for (int j = 0; j < 6; j++) virial[j] += ev.v[j];
       energy += ev.ecoul;
     } else {
       copymode = 1;
diff --git a/src/KOKKOS/third_order_kokkos.cpp b/src/KOKKOS/third_order_kokkos.cpp
index 6208aa966a..04c467777f 100644
--- a/src/KOKKOS/third_order_kokkos.cpp
+++ b/src/KOKKOS/third_order_kokkos.cpp
@@ -174,72 +174,45 @@ void ThirdOrderKokkos::update_force()
   }
 
   bool execute_on_host = false;
-  unsigned int datamask_read_device = 0;
-  unsigned int datamask_modify_device = 0;
   unsigned int datamask_read_host = 0;
 
   if (pair_compute_flag) {
     if (force->pair->execution_space==Host) {
       execute_on_host  = true;
       datamask_read_host   |= force->pair->datamask_read;
-      datamask_modify_device |= force->pair->datamask_modify;
-    } else {
-      datamask_read_device   |= force->pair->datamask_read;
-      datamask_modify_device |= force->pair->datamask_modify;
     }
   }
   if (atomKK->molecular && force->bond)  {
     if (force->bond->execution_space==Host) {
       execute_on_host  = true;
       datamask_read_host   |= force->bond->datamask_read;
-      datamask_modify_device |= force->bond->datamask_modify;
-    } else {
-      datamask_read_device   |= force->bond->datamask_read;
-      datamask_modify_device |= force->bond->datamask_modify;
     }
   }
   if (atomKK->molecular && force->angle) {
     if (force->angle->execution_space==Host) {
       execute_on_host  = true;
       datamask_read_host   |= force->angle->datamask_read;
-      datamask_modify_device |= force->angle->datamask_modify;
-    } else {
-      datamask_read_device   |= force->angle->datamask_read;
-      datamask_modify_device |= force->angle->datamask_modify;
     }
   }
   if (atomKK->molecular && force->dihedral) {
     if (force->dihedral->execution_space==Host) {
       execute_on_host  = true;
       datamask_read_host   |= force->dihedral->datamask_read;
-      datamask_modify_device |= force->dihedral->datamask_modify;
-    } else {
-      datamask_read_device   |= force->dihedral->datamask_read;
-      datamask_modify_device |= force->dihedral->datamask_modify;
     }
   }
   if (atomKK->molecular && force->improper) {
     if (force->improper->execution_space==Host) {
       execute_on_host  = true;
       datamask_read_host   |= force->improper->datamask_read;
-      datamask_modify_device |= force->improper->datamask_modify;
-    } else {
-      datamask_read_device   |= force->improper->datamask_read;
-      datamask_modify_device |= force->improper->datamask_modify;
     }
   }
   if (kspace_compute_flag) {
     if (force->kspace->execution_space==Host) {
       execute_on_host  = true;
       datamask_read_host   |= force->kspace->datamask_read;
-      datamask_modify_device |= force->kspace->datamask_modify;
-    } else {
-      datamask_read_device   |= force->kspace->datamask_read;
-      datamask_modify_device |= force->kspace->datamask_modify;
     }
   }
 
-
   if (pair_compute_flag) {
     atomKK->sync(force->pair->execution_space,force->pair->datamask_read);
     atomKK->sync(force->pair->execution_space,~(~force->pair->datamask_read|(F_MASK | ENERGY_MASK | VIRIAL_MASK)));
diff --git a/src/KOKKOS/transpose_helper_kokkos.h b/src/KOKKOS/transpose_helper_kokkos.h
index e3a4d86f9a..06af0aea91 100644
--- a/src/KOKKOS/transpose_helper_kokkos.h
+++ b/src/KOKKOS/transpose_helper_kokkos.h
@@ -125,8 +125,7 @@ struct TransposeHelperKokkos {
     elem[0] = extent_tile_id[0] * tile_size;
     elem[1] = extent_tile_id[1] * tile_size;
 
-    if (elem[0] >= d_dst.extent(0) ||
-      elem[1] >= d_dst.extent(1)) return;
+    if ((elem[0] >= (int)d_dst.extent(0)) || (elem[1] >= (int)d_dst.extent(1))) return;
 
     // determine if a row/column is a full `tile_size` in size or not
     bool perfect_pad[2];
@@ -135,35 +134,30 @@ struct TransposeHelperKokkos {
 
     // load phase
     if (src_is_layout_right) {
-      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, tile_size),
-        [&] (const int j) {
-
-        if (elem[1] + j < d_src.extent(1)) {
-          if (perfect_pad[0]) {
-            for (int i = 0; i < tile_size; i++)
-              buffer[i * (tile_size + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j);
-          } else {
-            for (int i = 0; i < (d_src.extent(0) - elem[0]); i++)
-              buffer[i * (tile_size + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j);
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, tile_size), [&] (const int j) {
+          if (elem[1] + j < (int)d_src.extent(1)) {
+            if (perfect_pad[0]) {
+              for (int i = 0; i < tile_size; i++)
+                buffer[i * (tile_size + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j);
+            } else {
+              for (int i = 0; i < ((int)d_src.extent(0) - elem[0]); i++)
+                buffer[i * (tile_size + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j);
+            }
           }
-        }
-      });
-
+        });
     } else {
       // src is layout left
-      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, tile_size),
-        [&] (const int i) {
-
-        if (elem[0] + i < d_src.extent(0)) {
-          if (perfect_pad[1]) {
-            for (int j = 0; j < tile_size; j++)
-              buffer[i * (tile_size + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j);
-          } else {
-            for (int j = 0; j < (d_src.extent(1) - elem[1]); j++)
-              buffer[i * (tile_size + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j);
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, tile_size), [&] (const int i) {
+          if (elem[0] + i < (int)d_src.extent(0)) {
+            if (perfect_pad[1]) {
+              for (int j = 0; j < tile_size; j++)
+                buffer[i * (tile_size + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j);
+            } else {
+              for (int j = 0; j < ((int)d_src.extent(1) - elem[1]); j++)
+                buffer[i * (tile_size + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j);
+            }
           }
-        }
-      });
+        });
     }
 
     // No need for an extra sync b/c there is an implicit sync at the end
@@ -171,37 +165,31 @@ struct TransposeHelperKokkos {
 
     // save phase
     if (src_is_layout_right) {
-      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, tile_size),
-        [&] (const int i) {
-
-        if (elem[0] + i < d_dst.extent(0)) {
-          if (perfect_pad[1]) {
-            for (int j = 0; j < tile_size; j++)
-              d_dst(elem[0] + i, elem[1] + j) = buffer[i * (tile_size + bank_pad) + j];
-          } else {
-            for (int j = 0; j < (d_dst.extent(1) - elem[1]); j++)
-              d_dst(elem[0] + i, elem[1] + j) = buffer[i * (tile_size + bank_pad) + j];
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, tile_size), [&] (const int i) {
+          if (elem[0] + i < (int)d_dst.extent(0)) {
+            if (perfect_pad[1]) {
+              for (int j = 0; j < tile_size; j++)
+                d_dst(elem[0] + i, elem[1] + j) = buffer[i * (tile_size + bank_pad) + j];
+            } else {
+              for (int j = 0; j < ((int)d_dst.extent(1) - elem[1]); j++)
+                d_dst(elem[0] + i, elem[1] + j) = buffer[i * (tile_size + bank_pad) + j];
+            }
           }
-        }
-      });
+        });
     } else {
-
       // src is layout left
-      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, tile_size),
-        [&] (const int j) {
-
-        if (elem[1] + j < d_dst.extent(1)) {
-          if (perfect_pad[0]) {
-            for (int i = 0; i < tile_size; i++)
-              d_dst(elem[0] + i, elem[1] + j) = buffer[i * (tile_size + bank_pad) + j];
-          } else {
-            for (int i = 0; i < (d_dst.extent(0) - elem[0]); i++)
-              d_dst(elem[0] + i, elem[1] + j) = buffer[i * (tile_size + bank_pad) + j];
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, tile_size), [&] (const int j) {
+          if (elem[1] + j < (int)d_dst.extent(1)) {
+            if (perfect_pad[0]) {
+              for (int i = 0; i < tile_size; i++)
+                d_dst(elem[0] + i, elem[1] + j) = buffer[i * (tile_size + bank_pad) + j];
+            } else {
+              for (int i = 0; i < ((int)d_dst.extent(0) - elem[0]); i++)
+                d_dst(elem[0] + i, elem[1] + j) = buffer[i * (tile_size + bank_pad) + j];
+            }
           }
-        }
-      });
+        });
     }
-
   }
 };
 
diff --git a/src/KSPACE/pair_lj_charmmfsw_coul_long.cpp b/src/KSPACE/pair_lj_charmmfsw_coul_long.cpp
index b7635c49c7..260c26e8aa 100644
--- a/src/KSPACE/pair_lj_charmmfsw_coul_long.cpp
+++ b/src/KSPACE/pair_lj_charmmfsw_coul_long.cpp
@@ -76,6 +76,8 @@ PairLJCharmmfswCoulLong::PairLJCharmmfswCoulLong(LAMMPS *lmp) : Pair(lmp)
 
 PairLJCharmmfswCoulLong::~PairLJCharmmfswCoulLong()
 {
+  if (copymode) return;
+
   // switch qqr2e back from CHARMM value to LAMMPS value
 
   if (update && strcmp(update->unit_style,"real") == 0) {
@@ -85,8 +87,6 @@ PairLJCharmmfswCoulLong::~PairLJCharmmfswCoulLong()
     force->qqr2e = force->qqr2e_lammps_real;
   }
 
-  if (copymode) return;
-
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(cutsq);
diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp
index 2f5b4fc670..ac516ff18c 100644
--- a/src/KSPACE/pppm.cpp
+++ b/src/KSPACE/pppm.cpp
@@ -1188,7 +1188,7 @@ double PPPM::compute_qopt()
   // each proc calculates contributions from every Pth grid point
 
   bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
-  int nxy_pppm = nx_pppm * ny_pppm;
+  bigint nxy_pppm = (bigint) nx_pppm * ny_pppm;
 
   double qopt = 0.0;
 
@@ -1944,7 +1944,8 @@ void PPPM::poisson_ik()
 
   // global energy and virial contribution
 
-  double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
+  bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
+  double scaleinv = 1.0/ngridtotal;
   double s2 = scaleinv*scaleinv;
 
   if (eflag_global || vflag_global) {
@@ -2145,7 +2146,8 @@ void PPPM::poisson_ad()
 
   // global energy and virial contribution
 
-  double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
+  bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
+  double scaleinv = 1.0/ngridtotal;
   double s2 = scaleinv*scaleinv;
 
   if (eflag_global || vflag_global) {
@@ -3259,7 +3261,8 @@ void PPPM::poisson_groups(int AA_flag)
   //  keep everything in reciprocal space so
   //  no inverse FFTs needed
 
-  double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
+  bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
+  double scaleinv = 1.0/ngridtotal;
   double s2 = scaleinv*scaleinv;
 
   // energy
diff --git a/src/KSPACE/pppm_dipole.cpp b/src/KSPACE/pppm_dipole.cpp
index a01ffea1dc..e0d13f2b9a 100644
--- a/src/KSPACE/pppm_dipole.cpp
+++ b/src/KSPACE/pppm_dipole.cpp
@@ -1338,7 +1338,8 @@ void PPPMDipole::poisson_ik_dipole()
 
   // global energy and virial contribution
 
-  double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
+  bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
+  double scaleinv = 1.0/ngridtotal;
   double s2 = scaleinv*scaleinv;
 
   if (eflag_global || vflag_global) {
diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp
index 72424a7330..a738db98d2 100644
--- a/src/KSPACE/pppm_disp.cpp
+++ b/src/KSPACE/pppm_disp.cpp
@@ -4556,7 +4556,8 @@ void PPPMDisp::poisson_ik(FFT_SCALAR* wk1, FFT_SCALAR* wk2,
 
   // if requested, compute energy and virial contribution
 
-  double scaleinv = 1.0/(nx_p*ny_p*nz_p);
+  bigint ngridtotal = (bigint) nx_p * ny_p * nz_p;
+  double scaleinv = 1.0/ngridtotal;
   double s2 = scaleinv*scaleinv;
 
   if (eflag_global || vflag_global) {
@@ -4696,7 +4697,8 @@ void PPPMDisp::poisson_ad(FFT_SCALAR* wk1, FFT_SCALAR* wk2,
 
   // if requested, compute energy and virial contribution
 
-  double scaleinv = 1.0/(nx_p*ny_p*nz_p);
+  bigint ngridtotal = (bigint) nx_p * ny_p * nz_p;
+  double scaleinv = 1.0/ngridtotal;
   double s2 = scaleinv*scaleinv;
 
   if (eflag_global || vflag_global) {
@@ -4844,7 +4846,8 @@ poisson_2s_ik(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2,
   int i,j,k,n;
   double eng;
 
-  double scaleinv = 1.0/(nx_pppm_6*ny_pppm_6*nz_pppm_6);
+  bigint ngridtotal = (bigint) nx_pppm_6 * ny_pppm_6 * nz_pppm_6;
+  double scaleinv = 1.0/ngridtotal;
 
   // transform charge/dispersion density (r -> k)
   // only one transform when energies and pressures not calculated
@@ -5017,7 +5020,8 @@ poisson_none_ik(int n1, int n2,FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2,
   int i,j,k,n;
   double eng;
 
-  double scaleinv = 1.0/(nx_pppm_6*ny_pppm_6*nz_pppm_6);
+  bigint ngridtotal = (bigint) nx_pppm_6 * ny_pppm_6 * nz_pppm_6;
+  double scaleinv = 1.0/ngridtotal;
 
   // transform charge/dispersion density (r -> k)
   // only one transform required when energies and pressures not needed
@@ -5191,7 +5195,8 @@ poisson_2s_ad(FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2,
   int i,j,k,n;
   double eng;
 
-  double scaleinv = 1.0/(nx_pppm_6*ny_pppm_6*nz_pppm_6);
+  bigint ngridtotal = (bigint) nx_pppm_6 * ny_pppm_6 * nz_pppm_6;
+  double scaleinv = 1.0/ngridtotal;
 
   // transform charge/dispersion density (r -> k)
   // only one tansform required when energies and pressures not needed
@@ -5289,7 +5294,8 @@ poisson_none_ad(int n1, int n2, FFT_SCALAR* dfft_1, FFT_SCALAR* dfft_2,
   int i,j,k,n;
   double eng;
 
-  double scaleinv = 1.0/(nx_pppm_6*ny_pppm_6*nz_pppm_6);
+  bigint ngridtotal = (bigint) nx_pppm_6 * ny_pppm_6 * nz_pppm_6;
+  double scaleinv = 1.0/ngridtotal;
 
   // transform charge/dispersion density (r -> k)
   // only one tansform required when energies and pressures not needed
diff --git a/src/KSPACE/pppm_stagger.cpp b/src/KSPACE/pppm_stagger.cpp
index d6f3c9cac6..d44f2428c8 100644
--- a/src/KSPACE/pppm_stagger.cpp
+++ b/src/KSPACE/pppm_stagger.cpp
@@ -302,7 +302,7 @@ double PPPMStagger::compute_qopt()
   // each proc calculates contributions from every Pth grid point
 
   bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
-  int nxy_pppm = nx_pppm * ny_pppm;
+  bigint nxy_pppm = (bigint) nx_pppm * ny_pppm;
 
   double qopt = 0.0;
 
@@ -398,7 +398,7 @@ double PPPMStagger::compute_qopt_ad()
   // each proc calculates contributions from every Pth grid point
 
   bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
-  int nxy_pppm = nx_pppm * ny_pppm;
+  bigint nxy_pppm = (bigint) nx_pppm * ny_pppm;
 
   double qopt = 0.0;
 
diff --git a/src/LATBOLTZ/fix_lb_fluid.cpp b/src/LATBOLTZ/fix_lb_fluid.cpp
index f692d28084..f3d8f45142 100644
--- a/src/LATBOLTZ/fix_lb_fluid.cpp
+++ b/src/LATBOLTZ/fix_lb_fluid.cpp
@@ -4430,9 +4430,9 @@ void FixLbFluid::calc_MPT(double &totalmass, double totalmomentum[3], double &Ta
 ------------------------------------------------------------------------- */
 /* ---------------------------------------------------------------------- */
 
-int FixLbFluid::adjust_dof_fix() /* Based on same private method in compute class */
-{                                /* altered to return fix_dof */
-  int fix_dof = 0;
+bigint FixLbFluid::adjust_dof_fix() /* Based on same private method in compute class */
+{                                   /* altered to return fix_dof */
+  bigint fix_dof = 0;
   for (auto &ifix : modify->get_fix_list())
     if (ifix->dof_flag) fix_dof += ifix->dof(igroup);
   return fix_dof;
diff --git a/src/LATBOLTZ/fix_lb_fluid.h b/src/LATBOLTZ/fix_lb_fluid.h
index 19cd2c6dc3..f134b50901 100644
--- a/src/LATBOLTZ/fix_lb_fluid.h
+++ b/src/LATBOLTZ/fix_lb_fluid.h
@@ -182,7 +182,7 @@ class FixLbFluid : public Fix {
   void calc_fluidforceII(void);
   void calc_fluidforceweight(void);
 
-  int adjust_dof_fix();
+  bigint adjust_dof_fix();
   double dof_compute();
 
   /* nanopit parameters */
diff --git a/src/LEPTON/angle_lepton.cpp b/src/LEPTON/angle_lepton.cpp
index 59310f5637..9fe565f8ee 100644
--- a/src/LEPTON/angle_lepton.cpp
+++ b/src/LEPTON/angle_lepton.cpp
@@ -44,6 +44,7 @@ AngleLepton::AngleLepton(LAMMPS *_lmp) :
 {
   writedata = 1;
   reinitflag = 0;
+  auto_offset = 1;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -90,10 +91,21 @@ template <int EVFLAG, int EFLAG, int NEWTON_BOND> void AngleLepton::eval()
 {
   std::vector<Lepton::CompiledExpression> angleforce;
   std::vector<Lepton::CompiledExpression> anglepot;
-  for (const auto &expr : expressions) {
-    auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, lmp));
-    angleforce.emplace_back(parsed.differentiate("theta").createCompiledExpression());
-    if (EFLAG) anglepot.emplace_back(parsed.createCompiledExpression());
+  std::vector<bool> has_ref;
+  try {
+    for (const auto &expr : expressions) {
+      auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, lmp));
+      angleforce.emplace_back(parsed.differentiate("theta").createCompiledExpression());
+      has_ref.push_back(true);
+      try {
+        angleforce.back().getVariableReference("theta");
+      } catch (Lepton::Exception &) {
+        has_ref.back() = false;
+      }
+      if (EFLAG) anglepot.emplace_back(parsed.createCompiledExpression());
+    }
+  } catch (std::exception &e) {
+    error->all(FLERR, e.what());
   }
 
   const double *const *const x = atom->x;
@@ -142,8 +154,7 @@ template <int EVFLAG, int EFLAG, int NEWTON_BOND> void AngleLepton::eval()
 
     const double dtheta = acos(c) - theta0[type];
     const int idx = type2expression[type];
-    angleforce[idx].getVariableReference("theta") = dtheta;
-
+    if (has_ref[idx]) angleforce[idx].getVariableReference("theta") = dtheta;
     const double a = -angleforce[idx].evaluate() * s;
     const double a11 = a * c / rsq1;
     const double a12 = -a / (r1 * r2);
@@ -179,7 +190,11 @@ template <int EVFLAG, int EFLAG, int NEWTON_BOND> void AngleLepton::eval()
 
     double eangle = 0.0;
     if (EFLAG) {
-      anglepot[idx].getVariableReference("theta") = dtheta;
+      try {
+        anglepot[idx].getVariableReference("theta") = dtheta;
+      } catch (Lepton::Exception &) {
+        ;    // ignore -> constant potential
+      }
       eangle = anglepot[idx].evaluate() - offset[type];
     }
     if (EVFLAG)
@@ -202,6 +217,24 @@ void AngleLepton::allocate()
   for (int i = 1; i < np1; i++) setflag[i] = 0;
 }
 
+/* ----------------------------------------------------------------------
+   global settings: optional keyword auto_offset (default) or no_offset
+------------------------------------------------------------------------- */
+
+void AngleLepton::settings(int narg, char **arg)
+{
+  auto_offset = 1;
+  if (narg > 0) {
+    if (strcmp(arg[0],"auto_offset") == 0) {
+      auto_offset = 1;
+    } else if (strcmp(arg[0],"no_offset") == 0) {
+      auto_offset = 0;
+    } else {
+      error->all(FLERR, "Unknown angle style lepton setting {}", arg[0]);
+    }
+  }
+}
+
 /* ----------------------------------------------------------------------
    set coeffs for one or more types
 ------------------------------------------------------------------------- */
@@ -224,9 +257,20 @@ void AngleLepton::coeff(int narg, char **arg)
     auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(exp_one, lmp));
     auto anglepot = parsed.createCompiledExpression();
     auto angleforce = parsed.differentiate("theta").createCompiledExpression();
-    anglepot.getVariableReference("theta") = 0.0;
-    angleforce.getVariableReference("theta") = 0.0;
-    offset_one = anglepot.evaluate();
+    try {
+      anglepot.getVariableReference("theta") = 0.0;
+    } catch (Lepton::Exception &) {
+      if (comm->me == 0)
+        error->warning(FLERR, "Lepton potential expression {} does not depend on 'theta'", exp_one);
+    }
+    try {
+      angleforce.getVariableReference("theta") = 0.0;
+    } catch (Lepton::Exception &) {
+      if (comm->me == 0)
+        error->warning(FLERR, "Force from Lepton expression {} does not depend on 'theta'",
+                       exp_one);
+    }
+    if (auto_offset) offset_one = anglepot.evaluate();
     angleforce.evaluate();
   } catch (std::exception &e) {
     error->all(FLERR, e.what());
@@ -284,6 +328,7 @@ void AngleLepton::write_restart(FILE *fp)
     fwrite(&n, sizeof(int), 1, fp);
     fwrite(exp.c_str(), sizeof(char), n, fp);
   }
+  fwrite(&auto_offset, sizeof(int), 1, fp);
 }
 
 /* ----------------------------------------------------------------------
@@ -323,6 +368,9 @@ void AngleLepton::read_restart(FILE *fp)
     expressions.emplace_back(buf);
   }
 
+  if (comm->me == 0) utils::sfread(FLERR, &auto_offset, sizeof(int), 1, fp, nullptr, error);
+  MPI_Bcast(&auto_offset, 1, MPI_INT, 0, world);
+
   delete[] buf;
 }
 
@@ -363,7 +411,11 @@ double AngleLepton::single(int type, int i1, int i2, int i3)
   const auto &expr = expressions[type2expression[type]];
   auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, lmp));
   auto anglepot = parsed.createCompiledExpression();
-  anglepot.getVariableReference("theta") = dtheta;
+  try {
+    anglepot.getVariableReference("theta") = dtheta;
+  } catch (Lepton::Exception &) {
+    ;    // ignore -> constant potential
+  }
   return anglepot.evaluate() - offset[type];
 }
 
diff --git a/src/LEPTON/angle_lepton.h b/src/LEPTON/angle_lepton.h
index 67d2718fb6..4f0e5729ed 100644
--- a/src/LEPTON/angle_lepton.h
+++ b/src/LEPTON/angle_lepton.h
@@ -29,6 +29,7 @@ class AngleLepton : public Angle {
   AngleLepton(class LAMMPS *);
   ~AngleLepton() override;
   void compute(int, int) override;
+  void settings(int, char **) override;
   void coeff(int, char **) override;
   double equilibrium_angle(int) override;
   void write_restart(FILE *) override;
@@ -42,6 +43,7 @@ class AngleLepton : public Angle {
   double *theta0;
   int *type2expression;
   double *offset;
+  int auto_offset;
 
   virtual void allocate();
 
diff --git a/src/LEPTON/bond_lepton.cpp b/src/LEPTON/bond_lepton.cpp
index 773607782d..8679d0ed62 100644
--- a/src/LEPTON/bond_lepton.cpp
+++ b/src/LEPTON/bond_lepton.cpp
@@ -37,6 +37,7 @@ BondLepton::BondLepton(LAMMPS *_lmp) :
 {
   writedata = 1;
   reinitflag = 0;
+  auto_offset = 1;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -82,10 +83,17 @@ template <int EVFLAG, int EFLAG, int NEWTON_BOND> void BondLepton::eval()
 {
   std::vector<Lepton::CompiledExpression> bondforce;
   std::vector<Lepton::CompiledExpression> bondpot;
+  std::vector<bool> has_ref;
   try {
     for (const auto &expr : expressions) {
       auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, lmp));
       bondforce.emplace_back(parsed.differentiate("r").createCompiledExpression());
+      has_ref.push_back(true);
+      try {
+        bondforce.back().getVariableReference("r");
+      } catch (Lepton::Exception &) {
+        has_ref.back() = false;
+      }
       if (EFLAG) bondpot.emplace_back(parsed.createCompiledExpression());
     }
   } catch (std::exception &e) {
@@ -116,7 +124,7 @@ template <int EVFLAG, int EFLAG, int NEWTON_BOND> void BondLepton::eval()
 
     double fbond = 0.0;
     if (r > 0.0) {
-      bondforce[idx].getVariableReference("r") = dr;
+      if (has_ref[idx]) bondforce[idx].getVariableReference("r") = dr;
       fbond = -bondforce[idx].evaluate() / r;
     }
 
@@ -136,7 +144,11 @@ template <int EVFLAG, int EFLAG, int NEWTON_BOND> void BondLepton::eval()
 
     double ebond = 0.0;
     if (EFLAG) {
-      bondpot[idx].getVariableReference("r") = dr;
+      try {
+        bondpot[idx].getVariableReference("r") = dr;
+      } catch (Lepton::Exception &) {
+        ;    // ignore -> constant potential
+      }
       ebond = bondpot[idx].evaluate() - offset[type];
     }
     if (EVFLAG) ev_tally(i1, i2, nlocal, NEWTON_BOND, ebond, fbond, delx, dely, delz);
@@ -157,6 +169,24 @@ void BondLepton::allocate()
   for (int i = 1; i < np1; i++) setflag[i] = 0;
 }
 
+/* ----------------------------------------------------------------------
+   global settings: optional keyword auto_offset (default) or no_offset
+------------------------------------------------------------------------- */
+
+void BondLepton::settings(int narg, char **arg)
+{
+  auto_offset = 1;
+  if (narg > 0) {
+    if (strcmp(arg[0],"auto_offset") == 0) {
+      auto_offset = 1;
+    } else if (strcmp(arg[0],"no_offset") == 0) {
+      auto_offset = 0;
+    } else {
+      error->all(FLERR, "Unknown bond style lepton setting {}", arg[0]);
+    }
+  }
+}
+
 /* ----------------------------------------------------------------------
    set coeffs for one or more types
 ------------------------------------------------------------------------- */
@@ -179,9 +209,19 @@ void BondLepton::coeff(int narg, char **arg)
     auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(exp_one, lmp));
     auto bondpot = parsed.createCompiledExpression();
     auto bondforce = parsed.differentiate("r").createCompiledExpression();
-    bondpot.getVariableReference("r") = 0.0;
-    bondforce.getVariableReference("r") = 0.0;
-    offset_one = bondpot.evaluate();
+    try {
+      bondpot.getVariableReference("r") = 0.0;
+    } catch (Lepton::Exception &) {    // potential has no 'r' variable: warn, keep going
+      if (comm->me == 0)
+        error->warning(FLERR, "Lepton potential expression {} does not depend on 'r'", exp_one);
+    }
+    try {
+      bondforce.getVariableReference("r") = 0.0;
+    } catch (Lepton::Exception &) {    // derivative has no 'r' variable: warn, keep going
+      if (comm->me == 0)
+        error->warning(FLERR, "Force from Lepton expression {} does not depend on 'r'", exp_one);
+    }
+    if (auto_offset) offset_one = bondpot.evaluate();
     bondforce.evaluate();
   } catch (std::exception &e) {
     error->all(FLERR, e.what());
@@ -239,6 +279,7 @@ void BondLepton::write_restart(FILE *fp)
     fwrite(&n, sizeof(int), 1, fp);
     fwrite(exp.c_str(), sizeof(char), n, fp);
   }
+  fwrite(&auto_offset, sizeof(int), 1, fp);
 }
 
 /* ----------------------------------------------------------------------
@@ -278,6 +319,9 @@ void BondLepton::read_restart(FILE *fp)
     expressions.emplace_back(buf);
   }
 
+  if (comm->me == 0) utils::sfread(FLERR, &auto_offset, sizeof(int), 1, fp, nullptr, error);
+  MPI_Bcast(&auto_offset, 1, MPI_INT, 0, world);
+
   delete[] buf;
 }
 
@@ -302,8 +346,12 @@ double BondLepton::single(int type, double rsq, int /*i*/, int /*j*/, double &ff
   auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, lmp));
   auto bondpot = parsed.createCompiledExpression();
   auto bondforce = parsed.differentiate("r").createCompiledExpression();
-  bondforce.getVariableReference("r") = dr;
-  bondpot.getVariableReference("r") = dr;
+  try {
+    bondpot.getVariableReference("r") = dr;
+    bondforce.getVariableReference("r") = dr;
+  } catch (Lepton::Exception &) {
+    ;    // ignore -> constant potential or force
+  }
 
   // force and energy
 
diff --git a/src/LEPTON/bond_lepton.h b/src/LEPTON/bond_lepton.h
index 9e693298a7..e59648a3f0 100644
--- a/src/LEPTON/bond_lepton.h
+++ b/src/LEPTON/bond_lepton.h
@@ -29,6 +29,7 @@ class BondLepton : public Bond {
   BondLepton(class LAMMPS *);
   ~BondLepton() override;
   void compute(int, int) override;
+  void settings(int, char **) override;
   void coeff(int, char **) override;
   double equilibrium_distance(int) override;
   void write_restart(FILE *) override;
@@ -42,6 +43,7 @@ class BondLepton : public Bond {
   double *r0;
   int *type2expression;
   double *offset;
+  int auto_offset;
 
   virtual void allocate();
 
diff --git a/src/LEPTON/dihedral_lepton.cpp b/src/LEPTON/dihedral_lepton.cpp
index 6470e43033..069ff13d74 100644
--- a/src/LEPTON/dihedral_lepton.cpp
+++ b/src/LEPTON/dihedral_lepton.cpp
@@ -92,10 +92,17 @@ template <int EVFLAG, int EFLAG, int NEWTON_BOND> void DihedralLepton::eval()
 {
   std::vector<Lepton::CompiledExpression> dihedralforce;
   std::vector<Lepton::CompiledExpression> dihedralpot;
+  std::vector<bool> has_ref;
   try {
     for (const auto &expr : expressions) {
       auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, lmp));
       dihedralforce.emplace_back(parsed.differentiate("phi").createCompiledExpression());
+      has_ref.push_back(true);
+      try {
+        dihedralforce.back().getVariableReference("phi");
+      } catch (Lepton::Exception &) {
+        has_ref.back() = false;
+      }
       if (EFLAG) dihedralpot.emplace_back(parsed.createCompiledExpression());
     }
   } catch (std::exception &e) {
@@ -278,7 +285,7 @@ template <int EVFLAG, int EFLAG, int NEWTON_BOND> void DihedralLepton::eval()
     }
 
     const int idx = type2expression[type];
-    dihedralforce[idx].getVariableReference("phi") = phi;
+    if (has_ref[idx]) dihedralforce[idx].getVariableReference("phi") = phi;
     double m_du_dphi = -dihedralforce[idx].evaluate();
 
     // ----- Step 4: Calculate the force direction in real space -----
@@ -322,7 +329,11 @@ template <int EVFLAG, int EFLAG, int NEWTON_BOND> void DihedralLepton::eval()
 
     double edihedral = 0.0;
     if (EFLAG) {
-      dihedralpot[idx].getVariableReference("phi") = phi;
+      try {
+        dihedralpot[idx].getVariableReference("phi") = phi;
+      } catch (Lepton::Exception &) {
+        ;    // ignore -> constant potential
+      }
       edihedral = dihedralpot[idx].evaluate();
     }
     if (EVFLAG)
@@ -362,8 +373,18 @@ void DihedralLepton::coeff(int narg, char **arg)
     auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(exp_one, lmp));
     auto dihedralpot = parsed.createCompiledExpression();
     auto dihedralforce = parsed.differentiate("phi").createCompiledExpression();
-    dihedralpot.getVariableReference("phi") = 0.0;
-    dihedralforce.getVariableReference("phi") = 0.0;
+    try {
+      dihedralpot.getVariableReference("phi") = 0.0;
+    } catch (Lepton::Exception &) {
+      if (comm->me == 0)
+        error->warning(FLERR, "Lepton potential expression {} does not depend on 'phi'", exp_one);
+    }
+    try {
+      dihedralforce.getVariableReference("phi") = 0.0;
+    } catch (Lepton::Exception &) {
+      if (comm->me == 0)
+        error->warning(FLERR, "Force from Lepton expression {} does not depend on 'phi'", exp_one);
+    }
     dihedralforce.evaluate();
   } catch (std::exception &e) {
     error->all(FLERR, e.what());
diff --git a/src/LEPTON/fix_wall_lepton.cpp b/src/LEPTON/fix_wall_lepton.cpp
index a81d3c4edb..7530188c00 100644
--- a/src/LEPTON/fix_wall_lepton.cpp
+++ b/src/LEPTON/fix_wall_lepton.cpp
@@ -13,6 +13,7 @@
 
 #include "fix_wall_lepton.h"
 #include "atom.h"
+#include "comm.h"
 #include "error.h"
 
 #include "Lepton.h"
@@ -41,8 +42,18 @@ void FixWallLepton::post_constructor()
       auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(exp_one, lmp));
       auto wallpot = parsed.createCompiledExpression();
       auto wallforce = parsed.differentiate("r").createCompiledExpression();
-      wallpot.getVariableReference("r") = 0.0;
-      wallforce.getVariableReference("r") = 0.0;
+      try {
+        wallpot.getVariableReference("r") = 0.0;
+      } catch (Lepton::Exception &) {
+        if (comm->me == 0)
+          error->warning(FLERR, "Lepton potential expression {} does not depend on 'r'", exp_one);
+      }
+      try {
+        wallforce.getVariableReference("r") = 0.0;
+      } catch (Lepton::Exception &) {
+        if (comm->me == 0)
+          error->warning(FLERR, "Force from Lepton expression {} does not depend on 'r'", exp_one);
+      }
       wallpot.evaluate();
       wallforce.evaluate();
     } catch (std::exception &e) {
diff --git a/src/LEPTON/pair_lepton.cpp b/src/LEPTON/pair_lepton.cpp
index a8af0ce576..adc07cbfa8 100644
--- a/src/LEPTON/pair_lepton.cpp
+++ b/src/LEPTON/pair_lepton.cpp
@@ -27,6 +27,7 @@
 
 #include "Lepton.h"
 #include "lepton_utils.h"
+#include <array>
 #include <cmath>
 #include <map>
 
@@ -105,11 +106,17 @@ template <int EVFLAG, int EFLAG, int NEWTON_PAIR> void PairLepton::eval()
 
   std::vector<Lepton::CompiledExpression> pairforce;
   std::vector<Lepton::CompiledExpression> pairpot;
+  std::vector<bool> has_ref;
   try {
     for (const auto &expr : expressions) {
       auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, lmp), functions);
       pairforce.emplace_back(parsed.differentiate("r").createCompiledExpression());
-      pairforce.back().getVariableReference("r");
+      has_ref.push_back(true);
+      try {
+        pairforce.back().getVariableReference("r");
+      } catch (Lepton::Exception &) {
+        has_ref.back() = false;
+      }
       if (EFLAG) pairpot.emplace_back(parsed.createCompiledExpression());
     }
   } catch (std::exception &e) {
@@ -142,8 +149,7 @@ template <int EVFLAG, int EFLAG, int NEWTON_PAIR> void PairLepton::eval()
       if (rsq < cutsq[itype][jtype]) {
         const double r = sqrt(rsq);
         const int idx = type2expression[itype][jtype];
-        double &r_for = pairforce[idx].getVariableReference("r");
-        r_for = r;
+        if (has_ref[idx]) pairforce[idx].getVariableReference("r") = r;
         const double fpair = -pairforce[idx].evaluate() / r * factor_lj;
 
         fxtmp += delx * fpair;
@@ -157,7 +163,11 @@ template <int EVFLAG, int EFLAG, int NEWTON_PAIR> void PairLepton::eval()
 
         double evdwl = 0.0;
         if (EFLAG) {
-          pairpot[idx].getVariableReference("r") = r;
+          try {
+            pairpot[idx].getVariableReference("r") = r;
+          } catch (Lepton::Exception &) {
+            ;    // ignore -> constant potential
+          }
           evdwl = pairpot[idx].evaluate() - offset[itype][jtype];
           evdwl *= factor_lj;
         }
@@ -229,8 +239,12 @@ void PairLepton::coeff(int narg, char **arg)
     auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(exp_one, lmp), functions);
     auto pairforce = parsed.differentiate("r").createCompiledExpression();
     auto pairpot = parsed.createCompiledExpression();
-    pairpot.getVariableReference("r") = 1.0;
-    pairforce.getVariableReference("r") = 1.0;
+    try {
+      pairpot.getVariableReference("r") = 1.0;
+      pairforce.getVariableReference("r") = 1.0;
+    } catch (Lepton::Exception &) {
+      ;    // ignore -> constant potential or force
+    }
     pairpot.evaluate();
     pairforce.evaluate();
   } catch (std::exception &e) {
@@ -270,7 +284,11 @@ double PairLepton::init_one(int i, int j)
     try {
       auto expr = LeptonUtils::substitute(expressions[type2expression[i][j]], lmp);
       auto pairpot = Lepton::Parser::parse(expr, functions).createCompiledExpression();
-      pairpot.getVariableReference("r") = cut[i][j];
+      try {
+        pairpot.getVariableReference("r") = cut[i][j];
+      } catch (Lepton::Exception &) {
+        ;    // ignore -> constant potential
+      }
       offset[i][j] = pairpot.evaluate();
     } catch (std::exception &) {
     }
@@ -429,9 +447,12 @@ double PairLepton::single(int /* i */, int /* j */, int itype, int jtype, double
   auto pairforce = parsed.differentiate("r").createCompiledExpression();
 
   const double r = sqrt(rsq);
-  pairpot.getVariableReference("r") = r;
-  pairforce.getVariableReference("r") = r;
-
+  try {
+    pairpot.getVariableReference("r") = r;
+    pairforce.getVariableReference("r") = r;
+  } catch (Lepton::Exception &) {
+    ;    // ignore -> constant potential or force
+  }
   fforce = -pairforce.evaluate() / r * factor_lj;
   return (pairpot.evaluate() - offset[itype][jtype]) * factor_lj;
 }
diff --git a/src/LEPTON/pair_lepton_coul.cpp b/src/LEPTON/pair_lepton_coul.cpp
index 841565e874..f7d2042874 100644
--- a/src/LEPTON/pair_lepton_coul.cpp
+++ b/src/LEPTON/pair_lepton_coul.cpp
@@ -28,6 +28,8 @@
 
 #include "Lepton.h"
 #include "lepton_utils.h"
+
+#include <array>
 #include <cmath>
 
 using namespace LAMMPS_NS;
@@ -79,25 +81,30 @@ template <int EVFLAG, int EFLAG, int NEWTON_PAIR> void PairLeptonCoul::eval()
 
   std::vector<Lepton::CompiledExpression> pairforce;
   std::vector<Lepton::CompiledExpression> pairpot;
-  std::vector<std::pair<bool, bool>> have_q;
+  std::vector<std::array<bool, 3>> has_ref;
   try {
     for (const auto &expr : expressions) {
       auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, lmp), functions);
       pairforce.emplace_back(parsed.differentiate("r").createCompiledExpression());
+      has_ref.push_back({true, true, true});
+      try {
+        pairforce.back().getVariableReference("r");
+      } catch (Lepton::Exception &) {
+        has_ref.back()[0] = false;
+      }
       if (EFLAG) pairpot.emplace_back(parsed.createCompiledExpression());
-      pairforce.back().getVariableReference("r");
-      have_q.emplace_back(true, true);
 
       // check if there are references to charges
+
       try {
         pairforce.back().getVariableReference("qi");
-      } catch (std::exception &) {
-        have_q.back().first = false;
+      } catch (Lepton::Exception &) {
+        has_ref.back()[1] = false;
       }
       try {
         pairforce.back().getVariableReference("qj");
-      } catch (std::exception &) {
-        have_q.back().second = false;
+      } catch (Lepton::Exception &) {
+        has_ref.back()[2] = false;
       }
     }
   } catch (std::exception &e) {
@@ -130,9 +137,9 @@ template <int EVFLAG, int EFLAG, int NEWTON_PAIR> void PairLeptonCoul::eval()
       if (rsq < cutsq[itype][jtype]) {
         const double r = sqrt(rsq);
         const int idx = type2expression[itype][jtype];
-        pairforce[idx].getVariableReference("r") = r;
-        if (have_q[idx].first) pairforce[idx].getVariableReference("qi") = q2e * q[i];
-        if (have_q[idx].second) pairforce[idx].getVariableReference("qj") = q2e * q[j];
+        if (has_ref[idx][0]) pairforce[idx].getVariableReference("r") = r;
+        if (has_ref[idx][1]) pairforce[idx].getVariableReference("qi") = q2e * q[i];
+        if (has_ref[idx][2]) pairforce[idx].getVariableReference("qj") = q2e * q[j];
         const double fpair = -pairforce[idx].evaluate() / r * factor_coul;
 
         fxtmp += delx * fpair;
@@ -146,9 +153,14 @@ template <int EVFLAG, int EFLAG, int NEWTON_PAIR> void PairLeptonCoul::eval()
 
         double ecoul = 0.0;
         if (EFLAG) {
-          pairpot[idx].getVariableReference("r") = r;
-          if (have_q[idx].first) pairpot[idx].getVariableReference("qi") = q2e * q[i];
-          if (have_q[idx].second) pairpot[idx].getVariableReference("qj") = q2e * q[j];
+          try {
+            pairpot[idx].getVariableReference("r") = r;
+          } catch (Lepton::Exception &) {
+            ;    // ignore -> constant potential
+          }
+          if (has_ref[idx][1]) pairpot[idx].getVariableReference("qi") = q2e * q[i];
+          if (has_ref[idx][2]) pairpot[idx].getVariableReference("qj") = q2e * q[j];
+
           ecoul = pairpot[idx].evaluate();
           ecoul *= factor_coul;
         }
@@ -249,18 +261,22 @@ double PairLeptonCoul::single(int i, int j, int itype, int jtype, double rsq, do
 
   const double r = sqrt(rsq);
   const double q2e = sqrt(force->qqrd2e);
-  pairpot.getVariableReference("r") = r;
-  pairforce.getVariableReference("r") = r;
+  try {
+    pairpot.getVariableReference("r") = r;
+    pairforce.getVariableReference("r") = r;
+  } catch (Lepton::Exception &) {
+    ;    // ignore -> constant potential or force
+  }
   try {
     pairpot.getVariableReference("qi") = q2e * atom->q[i];
     pairforce.getVariableReference("qi") = q2e * atom->q[i];
-  } catch (std::exception &) {
+  } catch (Lepton::Exception &) {
     /* ignore */
   }
   try {
     pairpot.getVariableReference("qj") = q2e * atom->q[j];
     pairforce.getVariableReference("qj") = q2e * atom->q[j];
-  } catch (std::exception &) {
+  } catch (Lepton::Exception &) {
     /* ignore */
   }
 
diff --git a/src/LEPTON/pair_lepton_sphere.cpp b/src/LEPTON/pair_lepton_sphere.cpp
index 29514aed38..72d0e85d0b 100644
--- a/src/LEPTON/pair_lepton_sphere.cpp
+++ b/src/LEPTON/pair_lepton_sphere.cpp
@@ -28,6 +28,7 @@
 
 #include "Lepton.h"
 #include "lepton_utils.h"
+#include <array>
 #include <cmath>
 
 using namespace LAMMPS_NS;
@@ -77,25 +78,30 @@ template <int EVFLAG, int EFLAG, int NEWTON_PAIR> void PairLeptonSphere::eval()
 
   std::vector<Lepton::CompiledExpression> pairforce;
   std::vector<Lepton::CompiledExpression> pairpot;
-  std::vector<std::pair<bool, bool>> have_rad;
+  std::vector<std::array<bool, 3>> has_ref;
   try {
     for (const auto &expr : expressions) {
       auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, lmp), functions);
       pairforce.emplace_back(parsed.differentiate("r").createCompiledExpression());
+      has_ref.push_back({true, true, true});
+      try {
+        pairforce.back().getVariableReference("r");
+      } catch (Lepton::Exception &) {
+        has_ref.back()[0] = false;
+      }
       if (EFLAG) pairpot.emplace_back(parsed.createCompiledExpression());
-      pairforce.back().getVariableReference("r");
-      have_rad.emplace_back(true, true);
 
-      // check if there are references to charges
+      // check if there are references to radii
+
       try {
         pairforce.back().getVariableReference("radi");
-      } catch (std::exception &) {
-        have_rad.back().first = false;
+      } catch (Lepton::Exception &) {
+        has_ref.back()[1] = false;
       }
       try {
         pairforce.back().getVariableReference("radj");
-      } catch (std::exception &) {
-        have_rad.back().second = false;
+      } catch (Lepton::Exception &) {
+        has_ref.back()[2] = false;
       }
     }
   } catch (std::exception &e) {
@@ -128,9 +134,9 @@ template <int EVFLAG, int EFLAG, int NEWTON_PAIR> void PairLeptonSphere::eval()
       if (rsq < cutsq[itype][jtype]) {
         const double r = sqrt(rsq);
         const int idx = type2expression[itype][jtype];
-        pairforce[idx].getVariableReference("r") = r;
-        if (have_rad[idx].first) pairforce[idx].getVariableReference("radi") = radius[i];
-        if (have_rad[idx].second) pairforce[idx].getVariableReference("radj") = radius[j];
+        if (has_ref[idx][0]) pairforce[idx].getVariableReference("r") = r;
+        if (has_ref[idx][1]) pairforce[idx].getVariableReference("radi") = radius[i];
+        if (has_ref[idx][2]) pairforce[idx].getVariableReference("radj") = radius[j];
         const double fpair = -pairforce[idx].evaluate() / r * factor_lj;
 
         fxtmp += delx * fpair;
@@ -144,9 +150,14 @@ template <int EVFLAG, int EFLAG, int NEWTON_PAIR> void PairLeptonSphere::eval()
 
         double evdwl = 0.0;
         if (EFLAG) {
-          pairpot[idx].getVariableReference("r") = r;
-          if (have_rad[idx].first) pairpot[idx].getVariableReference("radi") = radius[i];
-          if (have_rad[idx].second) pairpot[idx].getVariableReference("radj") = radius[j];
+          try {
+            pairpot[idx].getVariableReference("r") = r;
+          } catch (Lepton::Exception &) {
+            ;    // ignore -> constant potential
+          }
+          if (has_ref[idx][1]) pairpot[idx].getVariableReference("radi") = radius[i];
+          if (has_ref[idx][2]) pairpot[idx].getVariableReference("radj") = radius[j];
+
           evdwl = pairpot[idx].evaluate();
           evdwl *= factor_lj;
         }
@@ -211,19 +222,23 @@ double PairLeptonSphere::single(int i, int j, int itype, int jtype, double rsq,
   auto pairforce = parsed.differentiate("r").createCompiledExpression();
 
   const double r = sqrt(rsq);
-  pairpot.getVariableReference("r") = r;
-  pairforce.getVariableReference("r") = r;
+  try {
+    pairpot.getVariableReference("r") = r;
+    pairforce.getVariableReference("r") = r;
+  } catch (Lepton::Exception &) {
+    ;    // ignore -> constant potential or force
+  }
   try {
     pairpot.getVariableReference("radi") = atom->radius[i];
     pairforce.getVariableReference("radi") = atom->radius[i];
-  } catch (std::exception &) {
-    /* ignore */
+  } catch (Lepton::Exception &) {
+    ;    // ignore
   }
   try {
     pairpot.getVariableReference("radj") = atom->radius[j];
     pairforce.getVariableReference("radj") = atom->radius[j];
-  } catch (std::exception &) {
-    /* ignore */
+  } catch (Lepton::Exception &) {
+    ;    // ignore
   }
 
   fforce = -pairforce.evaluate() / r * factor_lj;
diff --git a/src/MAKE/MACHINES/Makefile.bgq b/src/MAKE/MACHINES/Makefile.bgq
deleted file mode 100644
index 4baecb9fc3..0000000000
--- a/src/MAKE/MACHINES/Makefile.bgq
+++ /dev/null
@@ -1,60 +0,0 @@
-# bgq = IBM Blue Gene/Q, multiple compiler options, native MPI, ALCF FFTW2
-
-SHELL = /bin/bash
-.SUFFIXES: .cpp .u 
-
-# ---------------------------------------------------------------------
-# build rules and dependencies
-# do not edit this section
-# select which compiler by editing Makefile.bgq.details
-
-include ../MAKE/MACHINES/bgq.make.details
-
-include Makefile.package.settings
-include Makefile.package
-
-EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
-EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
-EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) $(DYN_LIB)
-EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
-EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
-
-# Path to src files
-
-vpath %.cpp ..
-vpath %.h ..
-
-# Link target
-
-$(EXE): main.o $(LMPLIB) $(EXTRA_LINK_DEPENDS)
-	$(LINK) $(LINKFLAGS) main.o $(EXTRA_PATH) $(LMPLINK) $(EXTRA_LIB) $(LIB) -o $@
-	$(SIZE) $@
-
-# Library targets
-
-$(ARLIB): $(OBJ) $(EXTRA_LINK_DEPENDS)
-	@rm -f ../$(ARLIB)
-	$(ARCHIVE) $(ARFLAGS) ../$(ARLIB) $(OBJ)
-	@rm -f $(ARLIB)
-	@ln -s ../$(ARLIB) $(ARLIB)
-
-$(SHLIB): $(OBJ) $(EXTRA_LINK_DEPENDS)
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o ../$(SHLIB) \
-		$(OBJ) $(EXTRA_LIB) $(LIB)
-	@rm -f $(SHLIB)
-	@ln -s ../$(SHLIB) $(SHLIB)
-
-# Compilation rules
-
-%.o:%.cpp
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
-
-# Individual dependencies
-
-depend : fastdep.exe $(SRC)
-	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
-
-fastdep.exe: ../DEPEND/fastdep.c
-	cc -O -o $@ $<
-
-sinclude .depend
diff --git a/src/MAKE/MACHINES/Makefile.xe6 b/src/MAKE/MACHINES/Makefile.xe6
deleted file mode 100644
index 9dbe0ba73e..0000000000
--- a/src/MAKE/MACHINES/Makefile.xe6
+++ /dev/null
@@ -1,125 +0,0 @@
-# xe6 = Cray XE6, Cray CC, native MPI, FFTW
-
-SHELL = /bin/sh
-.SUFFIXES: .cpp .d
-
-# ---------------------------------------------------------------------
-# compiler/linker settings
-# specify flags and libraries needed for your compiler
-
-CC =		CC
-CCFLAGS =	-fastsse
-SHFLAGS =	-fPIC
-DEPFLAGS =	-M
-
-LINK =		CC
-LINKFLAGS =	-O
-LIB =           -lstdc++
-SIZE =		size
-
-ARCHIVE =	ar
-ARFLAGS =	-rc
-SHLIBFLAGS =	-shared
-
-# ---------------------------------------------------------------------
-# LAMMPS-specific settings, all OPTIONAL
-# specify settings for LAMMPS features you will use
-# if you change any -D setting, do full re-compile after "make clean"
-
-# LAMMPS ifdef settings
-# see possible settings in Section 3.5 of the manual
-
-LMP_INC =	-DLAMMPS_GZIP
-
-# MPI library
-# see discussion in Section 3.4 of the manual
-# MPI wrapper compiler/linker can provide this info
-# can point to dummy MPI library in src/STUBS as in Makefile.serial
-# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
-# INC = path for mpi.h, MPI compiler settings
-# PATH = path for MPI library
-# LIB = name of MPI library
-
-MPI_INC =       -DMPICH_SKIP_MPICXX 
-MPI_PATH = 
-MPI_LIB =	
-
-# FFT library
-# see discussion in Section 3.5.2 of manual
-# can be left blank to use provided KISS FFT library
-# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
-# PATH = path for FFT library
-# LIB = name of FFT library
-
-FFT_INC =
-FFT_PATH =
-FFT_LIB =
-
-# JPEG and/or PNG library
-# see discussion in Section 3.5.4 of manual
-# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
-# INC = path(s) for jpeglib.h and/or png.h
-# PATH = path(s) for JPEG library and/or PNG library
-# LIB = name(s) of JPEG library and/or PNG library
-
-JPG_INC =
-JPG_PATH =
-JPG_LIB =
-
-#  library for loading shared objects (defaults to -ldl, should be empty on Windows)
-# uncomment to change the default
-
-# override DYN_LIB =
-
-# ---------------------------------------------------------------------
-# build rules and dependencies
-# do not edit this section
-
-include Makefile.package.settings
-include Makefile.package
-
-EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
-EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
-EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) $(DYN_LIB)
-EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
-EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
-
-# Path to src files
-
-vpath %.cpp ..
-vpath %.h ..
-
-# Link target
-
-$(EXE): main.o $(LMPLIB) $(EXTRA_LINK_DEPENDS)
-	$(LINK) $(LINKFLAGS) main.o $(EXTRA_PATH) $(LMPLINK) $(EXTRA_LIB) $(LIB) -o $@
-	$(SIZE) $@
-
-# Library targets
-
-$(ARLIB): $(OBJ) $(EXTRA_LINK_DEPENDS)
-	@rm -f ../$(ARLIB)
-	$(ARCHIVE) $(ARFLAGS) ../$(ARLIB) $(OBJ)
-	@rm -f $(ARLIB)
-	@ln -s ../$(ARLIB) $(ARLIB)
-
-$(SHLIB): $(OBJ) $(EXTRA_LINK_DEPENDS)
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o ../$(SHLIB) \
-		$(OBJ) $(EXTRA_LIB) $(LIB)
-	@rm -f $(SHLIB)
-	@ln -s ../$(SHLIB) $(SHLIB)
-
-# Compilation rules
-
-%.o:%.cpp
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
-
-# Individual dependencies
-
-depend : fastdep.exe $(SRC)
-	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
-
-fastdep.exe: ../DEPEND/fastdep.c
-	cc -O -o $@ $<
-
-sinclude .depend
diff --git a/src/MAKE/MACHINES/bgq.make.details b/src/MAKE/MACHINES/bgq.make.details
deleted file mode 100644
index 0febd69d26..0000000000
--- a/src/MAKE/MACHINES/bgq.make.details
+++ /dev/null
@@ -1,125 +0,0 @@
-# multiple compiler options for BGQ
-
-# ---------------------------------------------------------------------
-# compiler/linker settings
-# specify flags and libraries needed for your compiler
-
-# uncomment one and only one of the following three lines 
-# to choose a compiler toolchain
-
-#COMPILER = GCC
-#COMPILER = LLVM
-COMPILER = XLC
-
-ifeq ($(COMPILER),XLC)
-CC       = /bgsys/drivers/ppcfloor/comm/xl.ndebug/bin/mpixlcxx_r
-CCFLAGS  = -O3 -qarch=qp -qtune=qp -qsmp=omp -qsimd=auto -qhot=level=2 -qprefetch -qunroll=yes
-FC       = /bgsys/drivers/ppcfloor/comm/xl.ndebug/bin/mpixlf90_r
-FFLAGS   = -O3 -qarch=qp -qtune=qp -qsimd=auto -qhot=level=2 -qprefetch -qunroll=yes -qsmp=omp -qextname -qnosave
-DEPFLAGS = -M -qmakedep=gcc
-endif
-
-ifeq ($(COMPILER),GCC)
-CC       = /bgsys/drivers/ppcfloor/comm/gcc.legacy/bin/mpicxx
-CCFLAGS  = -O3 -fopenmp
-FC       = /bgsys/drivers/ppcfloor/comm/gcc.legacy/bin/mpif90
-FFLAGS   = -O3 -fopenmp
-DEPFLAGS = -M
-endif
-
-ifeq ($(COMPILER),LLVM)
-#CC       = bgclang++
-CC       = /home/projects/llvm/mpi/bgclang/bin/mpiclang++
-CCFLAGS  = -O3 -fopenmp
-DEPFLAGS = -M
-FC	 = /bin/false
-FFLAGS   = LLVM does not have a Fortran front-end!
-endif
-
-LINK      = $(CC)
-LINKFLAGS = $(CCFLAGS)
-
-ifeq ($(COMPILER),XLC)
-  MASS_LIB    = ${IBM_MAIN_DIR}/xlmass/bg/7.3/bglib64 
-  XLF_LIB     = ${IBM_MAIN_DIR}/xlf/bg/14.1/bglib64
-  XLSMP_LIB   = ${IBM_MAIN_DIR}/xlsmp/bg/3.1/bglib64
-  LIB        += -L${MASS_LIB} -L${XLF_LIB} -L${XLSMP_LIB} 
-  LIB        += -lmassv -lmass 
-  LIB        += -lxlf90_r -lxlsmp -lxlopt -lxlfmath -lxl
-endif
-
-ifeq ($(COMPILER),GCC)
-# libm is definitely slower than libmass...
-  LIB += -lm -lgfortran
-endif
-
-ifeq ($(COMPILER),LLVM)
-    SLEEF_DIR = /home/projects/llvm/sleef
-    LIB += -L${SLEEF_DIR}/lib -lsleef
-endif
-
-SIZE       = size
-
-ARCHIVE    = ar
-ARFLAGS    = -rc
-
-# BGQ should not use shared libraries
-
-SHFLAGS    =
-SHLIBFLAGS = 
-
-# ---------------------------------------------------------------------
-# LAMMPS-specific settings, all OPTIONAL
-# specify settings for LAMMPS features you will use
-# if you change any -D setting, do full re-compile after "make clean"
-
-# LAMMPS ifdef settings
-# see possible settings in Section 3.5 of the manual
-
-LMP_INC = -DLAMMPS_GZIP
-
-# MPI library
-# see discussion in Section 3.4 of the manual
-# MPI wrapper compiler/linker can provide this info
-# can point to dummy MPI library in src/STUBS as in Makefile.serial
-# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
-# INC = path for mpi.h, MPI compiler settings
-# PATH = path for MPI library
-# LIB = name of MPI library
-
-MPI_INC    = 
-MPI_PATH   = 
-MPI_LIB    =
-
-MPI_INC += -DMPICH_SKIP_MPICXX 
-MPI_LIB += #/home/jhammond/OSPRI/branches/marpn/wrap/libmpiarbrpn.a
-
-# FFT library
-# see discussion in Section 3.5.2 of manual
-# can be left blank to use provided KISS FFT library
-# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
-# PATH = path for FFT library
-# LIB = name of FFT library
-
-FFT_INC  = -I/soft/libraries/alcf/current/xl/FFTW2/include -DFFT_FFTW2 -DFFTW_SIZE
-FFT_PATH = #/soft/libraries/alcf/current/xl/FFTW2
-FFT_LIB  = -L/soft/libraries/alcf/current/xl/FFTW2/lib -ldfftw
-
-# JPEG and/or PNG library
-# see discussion in Section 3.5.4 of manual
-# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
-# INC = path(s) for jpeglib.h and/or png.h
-# PATH = path(s) for JPEG library and/or PNG library
-# LIB = name(s) of JPEG library and/or PNG library
-
-JPG_INC =
-JPG_PATH =
-JPG_LIB =
-
-depend : fastdep.exe $(SRC)
-	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
-
-fastdep.exe: ../DEPEND/fastdep.c
-	cc -O -o $@ $<
-
-sinclude .depend
diff --git a/src/MANIFOLD/fix_nve_manifold_rattle.cpp b/src/MANIFOLD/fix_nve_manifold_rattle.cpp
index b1efea951f..dc0492dbe9 100644
--- a/src/MANIFOLD/fix_nve_manifold_rattle.cpp
+++ b/src/MANIFOLD/fix_nve_manifold_rattle.cpp
@@ -287,21 +287,21 @@ void FixNVEManifoldRattle::update_var_params()
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
-int FixNVEManifoldRattle::dof(int /*igroup*/)
+bigint FixNVEManifoldRattle::dof(int /*igroup*/)
 {
   int *mask = atom->mask;
   int nlocal = atom->nlocal;
-  int natoms = 0;
+  bigint natoms = 0;
   for (int i = 0; i < nlocal; ++i) {
     if (mask[i] & groupbit) ++natoms;
   }
 
-  int dofs;
-  MPI_Allreduce( &natoms, &dofs, 1, MPI_INT, MPI_SUM, world );
+  bigint dofs;
+  MPI_Allreduce( &natoms, &dofs, 1, MPI_LMP_BIGINT, MPI_SUM, world );
 
   // Make sure that, if there is just no or one atom, no dofs are subtracted,
   // since for the first atom already 3 dofs are subtracted because of the
-  // centre of mass corrections:
+  // center of mass corrections:
   if (dofs <= 1) dofs = 0;
   stats.dofs_removed = dofs;
 
diff --git a/src/MANIFOLD/fix_nve_manifold_rattle.h b/src/MANIFOLD/fix_nve_manifold_rattle.h
index 3eae9c4bc3..7c9e302094 100644
--- a/src/MANIFOLD/fix_nve_manifold_rattle.h
+++ b/src/MANIFOLD/fix_nve_manifold_rattle.h
@@ -75,7 +75,7 @@ class FixNVEManifoldRattle : public Fix {
   void init() override;
   void reset_dt() override;
   void end_of_step() override;
-  int dof(int) override;
+  bigint dof(int) override;
   void setup(int) override {}    // Not needed for fixNVE but is for fixNVT
   double memory_usage() override;
 
diff --git a/src/ML-SNAP/compute_sna_atom.cpp b/src/ML-SNAP/compute_sna_atom.cpp
index da49b15117..b1b4a46482 100644
--- a/src/ML-SNAP/compute_sna_atom.cpp
+++ b/src/ML-SNAP/compute_sna_atom.cpp
@@ -580,58 +580,56 @@ void ComputeSNAAtom::select3(int k, int n, double *arr, int *iarr, double **arr3
   }
 }
 
-double * ComputeSNAAtom::weights(double * rsq, double rcut, int ncounts)
+double *ComputeSNAAtom::weights(double *rsq, double rcut, int ncounts)
 {
-  double * w=nullptr;
+  double *w=nullptr;
   memory->destroy(w);
   memory->create(w, ncounts, "snann:gauss_weights");
   double rloc=0.;
-  for (int i=0; i<ncounts; i++)
-    {
-      rloc = sqrt(rsq[i]);
-      if (rloc > rcut){
-        w[i]=0.;
-      } else {
-        w[i]=1.;
-      }
+  for (int i=0; i<ncounts; i++) {
+    rloc = sqrt(rsq[i]);
+    if (rloc > rcut){
+      w[i]=0.;
+    } else {
+      w[i]=1.;
     }
+  }
   return w;
 }
 
-double * ComputeSNAAtom::tanh_weights(double * rsq, double rcut, double delta, int ncounts)
+double *ComputeSNAAtom::tanh_weights(double *rsq, double rcut, double delta, int ncounts)
 {
-  double * w=nullptr;
+  double *w=nullptr;
   memory->destroy(w);
   memory->create(w, ncounts, "snann:gauss_weights");
   double rloc=0.;
 
-  for (int i=0; i<ncounts; i++)
-    {
-      rloc = sqrt(rsq[i]);
-      w[i] = 0.5*(1.-tanh((rloc-rcut)/delta));
-    }
+  for (int i=0; i<ncounts; i++) {
+    rloc = sqrt(rsq[i]);
+    w[i] = 0.5*(1.-tanh((rloc-rcut)/delta));
+  }
   return w;
 }
 
-double ComputeSNAAtom::sum_weights(double * /*rsq*/, double * w, int ncounts)
+double ComputeSNAAtom::sum_weights(double * /*rsq*/, double *w, int ncounts)
 {
-  double S=0.;
-  for (int i=0; i<ncounts; i++)
-    {
-      S += w[i];
-    }
+  double S=0.0;
+  for (int i=0; i<ncounts; i++) {
+    S += w[i];
+  }
   return S;
 }
 
-double ComputeSNAAtom::get_target_rcut(double S_target, double * rsq, double rcut, int ncounts, int weightmode, double delta)
+double ComputeSNAAtom::get_target_rcut(double S_target, double *rsq, double rcut, int ncounts,
+                                       int weightmode, double delta)
 {
   double S_sol = 0.0;
   if (weightmode == 0) {
-    double * www = weights(rsq, rcut, ncounts);
+    double *www = weights(rsq, rcut, ncounts);
     S_sol = sum_weights(rsq, www, ncounts);
     memory->destroy(www);
   } else if (weightmode == 1) {
-    double * www = tanh_weights(rsq, rcut, delta, ncounts);
+    double *www = tanh_weights(rsq, rcut, delta, ncounts);
     S_sol = sum_weights(rsq, www, ncounts);
     memory->destroy(www);
   }
@@ -639,38 +637,31 @@ double ComputeSNAAtom::get_target_rcut(double S_target, double * rsq, double rcu
   return err;
 }
 
-double * ComputeSNAAtom::dichotomie(double S_target, double a, double b, double e, double * rsq, int ncounts, int weightmode, double delta)
+double *ComputeSNAAtom::dichotomie(double S_target, double a, double b, double e, double *rsq,
+                                   int ncounts, int weightmode, double delta)
 {
 
   double d=b-a;
-  double * sol = nullptr;
+  double *sol = nullptr;
   memory->destroy(sol);
   memory->create(sol, 2, "snann:sol");
-  double m=0.;
+  double m=0.0;
 
-  int cnt=0;
-  do
-    {
-      m = ( a + b ) / 2.;
-      d = fabs( b - a );
-      double f_ra = get_target_rcut(S_target, rsq, a, ncounts, weightmode, delta);
-      double f_rm = get_target_rcut(S_target, rsq, m, ncounts, weightmode, delta);
-      if (f_rm == 0.)
-        {
-          sol[0]=m;
-          sol[1]=m;
-          return sol;
-        }
-      else if (f_rm*f_ra > 0.)
-        {
-          a = m;
-        }
-      else
-        {
-          b = m;
-        }
-      cnt+=1;
-    } while ( d > e );
+  do {
+    m = (a + b) / 2.0;
+    d = fabs(b - a);
+    double f_ra = get_target_rcut(S_target, rsq, a, ncounts, weightmode, delta);
+    double f_rm = get_target_rcut(S_target, rsq, m, ncounts, weightmode, delta);
+    if (f_rm == 0.0) {
+      sol[0]=m;
+      sol[1]=m;
+      return sol;
+    } else if (f_rm*f_ra > 0.0) {
+      a = m;
+    } else {
+      b = m;
+    }
+  } while (d > e);
   sol[0]=a;
   sol[1]=b;
   return sol;
diff --git a/src/OPENMP/angle_cosine_periodic_omp.cpp b/src/OPENMP/angle_cosine_periodic_omp.cpp
index 43b3a54a47..48532c8f6c 100644
--- a/src/OPENMP/angle_cosine_periodic_omp.cpp
+++ b/src/OPENMP/angle_cosine_periodic_omp.cpp
@@ -140,7 +140,7 @@ void AngleCosinePeriodicOMP::eval(int nfrom, int nto, ThrData * const thr)
     tn = 1.0;
     tn_1 = 1.0;
     tn_2 = 0.0;
-    un = 1.0;
+    un = (m==1) ? 2.0 : 1.0;
     un_1 = 2.0;
     un_2 = 0.0;
 
diff --git a/src/OPENMP/angle_lepton_omp.cpp b/src/OPENMP/angle_lepton_omp.cpp
index 7e86a9e9bb..f57cf916a2 100644
--- a/src/OPENMP/angle_lepton_omp.cpp
+++ b/src/OPENMP/angle_lepton_omp.cpp
@@ -91,10 +91,17 @@ void AngleLeptonOMP::eval(int nfrom, int nto, ThrData *const thr)
 {
   std::vector<Lepton::CompiledExpression> angleforce;
   std::vector<Lepton::CompiledExpression> anglepot;
+  std::vector<bool> has_ref;
   try {
     for (const auto &expr : expressions) {
       auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, Pointers::lmp));
       angleforce.emplace_back(parsed.differentiate("theta").createCompiledExpression());
+      has_ref.push_back(true);
+      try {
+        angleforce.back().getVariableReference("theta");
+      } catch (Lepton::Exception &) {
+        has_ref.back() = false;
+      }
       if (EFLAG) anglepot.emplace_back(parsed.createCompiledExpression());
     }
   } catch (std::exception &e) {
@@ -146,8 +153,7 @@ void AngleLeptonOMP::eval(int nfrom, int nto, ThrData *const thr)
 
     const double dtheta = acos(c) - theta0[type];
     const int idx = type2expression[type];
-    angleforce[idx].getVariableReference("theta") = dtheta;
-
+    if (has_ref[idx]) angleforce[idx].getVariableReference("theta") = dtheta;
     const double a = -angleforce[idx].evaluate() * s;
     const double a11 = a * c / rsq1;
     const double a12 = -a / (r1 * r2);
@@ -183,7 +189,11 @@ void AngleLeptonOMP::eval(int nfrom, int nto, ThrData *const thr)
 
     double eangle = 0.0;
     if (EFLAG) {
-      anglepot[idx].getVariableReference("theta") = dtheta;
+      try {
+        anglepot[idx].getVariableReference("theta") = dtheta;
+      } catch (Lepton::Exception &) {
+        ;    // ignore -> constant potential
+      }
       eangle = anglepot[idx].evaluate() - offset[type];
     }
     if (EVFLAG)
diff --git a/src/OPENMP/bond_lepton_omp.cpp b/src/OPENMP/bond_lepton_omp.cpp
index 0029062366..d9982b08f8 100644
--- a/src/OPENMP/bond_lepton_omp.cpp
+++ b/src/OPENMP/bond_lepton_omp.cpp
@@ -89,10 +89,17 @@ void BondLeptonOMP::eval(int nfrom, int nto, ThrData *const thr)
 {
   std::vector<Lepton::CompiledExpression> bondforce;
   std::vector<Lepton::CompiledExpression> bondpot;
+  std::vector<bool> has_ref;
   try {
     for (const auto &expr : expressions) {
       auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, Pointers::lmp));
       bondforce.emplace_back(parsed.differentiate("r").createCompiledExpression());
+      has_ref.push_back(true);
+      try {
+        bondforce.back().getVariableReference("r");
+      } catch (Lepton::Exception &) {
+        has_ref.back() = false;
+      }
       if (EFLAG) bondpot.emplace_back(parsed.createCompiledExpression());
     }
   } catch (std::exception &e) {
@@ -122,7 +129,7 @@ void BondLeptonOMP::eval(int nfrom, int nto, ThrData *const thr)
 
     double fbond = 0.0;
     if (r > 0.0) {
-      bondforce[idx].getVariableReference("r") = dr;
+      if (has_ref[idx]) bondforce[idx].getVariableReference("r") = dr;
       fbond = -bondforce[idx].evaluate() / r;
     }
 
@@ -142,7 +149,11 @@ void BondLeptonOMP::eval(int nfrom, int nto, ThrData *const thr)
 
     double ebond = 0.0;
     if (EFLAG) {
-      bondpot[idx].getVariableReference("r") = dr;
+      try {
+        bondpot[idx].getVariableReference("r") = dr;
+      } catch (Lepton::Exception &) {
+        ;    // ignore -> constant potential
+      }
       ebond = bondpot[idx].evaluate() - offset[type];
     }
     if (EVFLAG)
diff --git a/src/OPENMP/dihedral_lepton_omp.cpp b/src/OPENMP/dihedral_lepton_omp.cpp
index 13a1328058..37748ce9d5 100644
--- a/src/OPENMP/dihedral_lepton_omp.cpp
+++ b/src/OPENMP/dihedral_lepton_omp.cpp
@@ -19,9 +19,9 @@
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
+#include "math_extra.h"
 #include "neighbor.h"
 #include "suffix.h"
-#include "math_extra.h"
 
 #include <cmath>
 
@@ -94,10 +94,17 @@ void DihedralLeptonOMP::eval(int nfrom, int nto, ThrData *const thr)
 {
   std::vector<Lepton::CompiledExpression> dihedralforce;
   std::vector<Lepton::CompiledExpression> dihedralpot;
+  std::vector<bool> has_ref;
   try {
     for (const auto &expr : expressions) {
       auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, Pointers::lmp));
       dihedralforce.emplace_back(parsed.differentiate("phi").createCompiledExpression());
+      has_ref.push_back(true);
+      try {
+        dihedralforce.back().getVariableReference("phi");
+      } catch (Lepton::Exception &) {
+        has_ref.back() = false;
+      }
       if (EFLAG) dihedralpot.emplace_back(parsed.createCompiledExpression());
     }
   } catch (std::exception &e) {
@@ -106,7 +113,7 @@ void DihedralLeptonOMP::eval(int nfrom, int nto, ThrData *const thr)
 
   const double *const *const x = atom->x;
   auto *_noalias const f = (dbl3_t *) thr->get_f()[0];
-  const int * const * const dihedrallist = neighbor->dihedrallist;
+  const int *const *const dihedrallist = neighbor->dihedrallist;
   const int nlocal = atom->nlocal;
 
   // The dihedral angle "phi" is the angle between n123 and n234
@@ -279,7 +286,7 @@ void DihedralLeptonOMP::eval(int nfrom, int nto, ThrData *const thr)
     }
 
     const int idx = type2expression[type];
-    dihedralforce[idx].getVariableReference("phi") = phi;
+    if (has_ref[idx]) dihedralforce[idx].getVariableReference("phi") = phi;
     double m_du_dphi = -dihedralforce[idx].evaluate();
 
     // ----- Step 4: Calculate the force direction in real space -----
@@ -323,7 +330,11 @@ void DihedralLeptonOMP::eval(int nfrom, int nto, ThrData *const thr)
 
     double edihedral = 0.0;
     if (EFLAG) {
-      dihedralpot[idx].getVariableReference("phi") = phi;
+      try {
+        dihedralpot[idx].getVariableReference("phi") = phi;
+      } catch (Lepton::Exception &) {
+        ;    // ignore -> constant potential
+      }
       edihedral = dihedralpot[idx].evaluate();
     }
     if (EVFLAG)
diff --git a/src/OPENMP/npair_respa_nsq_omp.h b/src/OPENMP/npair_respa_nsq_omp.h
index 810931674c..c68d06b4b5 100644
--- a/src/OPENMP/npair_respa_nsq_omp.h
+++ b/src/OPENMP/npair_respa_nsq_omp.h
@@ -15,7 +15,7 @@
 // clang-format off
 typedef NPairRespaNsqOmp<0,0> NPairHalfRespaNsqNewtoffOmp;
 NPairStyle(half/respa/nsq/newtoff/omp,
-           NPairHalfRespaNsqNewtoff,
+           NPairHalfRespaNsqNewtoffOmp,
            NP_HALF | NP_RESPA | NP_NSQ | NP_OMP | NP_NEWTOFF | NP_ORTHO | NP_TRI);
 
 typedef NPairRespaNsqOmp<1,0> NPairHalfRespaNsqNewtonOmp;
diff --git a/src/OPENMP/pair_lepton_coul_omp.cpp b/src/OPENMP/pair_lepton_coul_omp.cpp
index bc34bc00af..532c16d797 100644
--- a/src/OPENMP/pair_lepton_coul_omp.cpp
+++ b/src/OPENMP/pair_lepton_coul_omp.cpp
@@ -20,11 +20,13 @@
 #include "neigh_list.h"
 #include "suffix.h"
 
-#include <cmath>
-
 #include "Lepton.h"
 #include "lepton_utils.h"
 #include "omp_compat.h"
+
+#include <array>
+#include <cmath>
+
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
@@ -101,25 +103,30 @@ void PairLeptonCoulOMP::eval(int iifrom, int iito, ThrData *const thr)
 
   std::vector<Lepton::CompiledExpression> pairforce;
   std::vector<Lepton::CompiledExpression> pairpot;
-  std::vector<std::pair<bool, bool>> have_q;
+  std::vector<std::array<bool, 3>> has_ref;
   try {
     for (const auto &expr : expressions) {
       auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, Pointers::lmp), functions);
       pairforce.emplace_back(parsed.differentiate("r").createCompiledExpression());
+      has_ref.push_back({true, true, true});
+      try {
+        pairforce.back().getVariableReference("r");
+      } catch (Lepton::Exception &) {
+        has_ref.back()[0] = false;
+      }
       if (EFLAG) pairpot.emplace_back(parsed.createCompiledExpression());
-      pairforce.back().getVariableReference("r");
-      have_q.emplace_back(true, true);
 
       // check if there are references to charges
+
       try {
         pairforce.back().getVariableReference("qi");
-      } catch (std::exception &) {
-        have_q.back().first = false;
+      } catch (Lepton::Exception &) {
+        has_ref.back()[1] = false;
       }
       try {
         pairforce.back().getVariableReference("qj");
-      } catch (std::exception &) {
-        have_q.back().second = false;
+      } catch (Lepton::Exception &) {
+        has_ref.back()[2] = false;
       }
     }
   } catch (std::exception &e) {
@@ -152,9 +159,9 @@ void PairLeptonCoulOMP::eval(int iifrom, int iito, ThrData *const thr)
       if (rsq < cutsq[itype][jtype]) {
         const double r = sqrt(rsq);
         const int idx = type2expression[itype][jtype];
-        pairforce[idx].getVariableReference("r") = r;
-        if (have_q[idx].first) pairforce[idx].getVariableReference("qi") = q2e * q[i];
-        if (have_q[idx].second) pairforce[idx].getVariableReference("qj") = q2e * q[j];
+        if (has_ref[idx][0]) pairforce[idx].getVariableReference("r") = r;
+        if (has_ref[idx][1]) pairforce[idx].getVariableReference("qi") = q2e * q[i];
+        if (has_ref[idx][2]) pairforce[idx].getVariableReference("qj") = q2e * q[j];
         const double fpair = -pairforce[idx].evaluate() / r * factor_coul;
 
         fxtmp += delx * fpair;
@@ -168,9 +175,14 @@ void PairLeptonCoulOMP::eval(int iifrom, int iito, ThrData *const thr)
 
         double ecoul = 0.0;
         if (EFLAG) {
-          pairpot[idx].getVariableReference("r") = r;
-          if (have_q[idx].first) pairpot[idx].getVariableReference("qi") = q2e * q[i];
-          if (have_q[idx].second) pairpot[idx].getVariableReference("qj") = q2e * q[j];
+          try {
+            pairpot[idx].getVariableReference("r") = r;
+          } catch (Lepton::Exception &) {
+            ;    // ignore -> constant potential
+          }
+          if (has_ref[idx][1]) pairpot[idx].getVariableReference("qi") = q2e * q[i];
+          if (has_ref[idx][2]) pairpot[idx].getVariableReference("qj") = q2e * q[j];
+
           ecoul = pairpot[idx].evaluate();
           ecoul *= factor_coul;
         }
diff --git a/src/OPENMP/pair_lepton_omp.cpp b/src/OPENMP/pair_lepton_omp.cpp
index b57b0fe11e..58692e52d6 100644
--- a/src/OPENMP/pair_lepton_omp.cpp
+++ b/src/OPENMP/pair_lepton_omp.cpp
@@ -20,11 +20,12 @@
 #include "neigh_list.h"
 #include "suffix.h"
 
-#include <cmath>
-
 #include "Lepton.h"
 #include "lepton_utils.h"
 #include "omp_compat.h"
+#include <array>
+#include <cmath>
+
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
@@ -96,10 +97,17 @@ void PairLeptonOMP::eval(int iifrom, int iito, ThrData *const thr)
 
   std::vector<Lepton::CompiledExpression> pairforce;
   std::vector<Lepton::CompiledExpression> pairpot;
+  std::vector<bool> have_ref;
   try {
     for (const auto &expr : expressions) {
       auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, Pointers::lmp), functions);
       pairforce.emplace_back(parsed.differentiate("r").createCompiledExpression());
+      have_ref.push_back(true);
+      try {
+        pairforce.back().getVariableReference("r");
+      } catch (Lepton::Exception &) {
+        have_ref.back() = false;
+      }
       if (EFLAG) pairpot.emplace_back(parsed.createCompiledExpression());
     }
   } catch (std::exception &e) {
@@ -132,7 +140,7 @@ void PairLeptonOMP::eval(int iifrom, int iito, ThrData *const thr)
       if (rsq < cutsq[itype][jtype]) {
         const double r = sqrt(rsq);
         const int idx = type2expression[itype][jtype];
-        pairforce[idx].getVariableReference("r") = r;
+        if (have_ref[idx]) pairforce[idx].getVariableReference("r") = r;
         const double fpair = -pairforce[idx].evaluate() / r * factor_lj;
 
         fxtmp += delx * fpair;
@@ -146,7 +154,11 @@ void PairLeptonOMP::eval(int iifrom, int iito, ThrData *const thr)
 
         double evdwl = 0.0;
         if (EFLAG) {
-          pairpot[idx].getVariableReference("r") = r;
+          try {
+            pairpot[idx].getVariableReference("r") = r;
+          } catch (Lepton::Exception &) {
+            ;    // ignore -> constant potential
+          }
           evdwl = pairpot[idx].evaluate() - offset[itype][jtype];
           evdwl *= factor_lj;
         }
diff --git a/src/OPENMP/pair_lepton_sphere_omp.cpp b/src/OPENMP/pair_lepton_sphere_omp.cpp
index 6d3a4827b3..79afe27717 100644
--- a/src/OPENMP/pair_lepton_sphere_omp.cpp
+++ b/src/OPENMP/pair_lepton_sphere_omp.cpp
@@ -20,11 +20,13 @@
 #include "neigh_list.h"
 #include "suffix.h"
 
-#include <cmath>
-
 #include "Lepton.h"
 #include "lepton_utils.h"
 #include "omp_compat.h"
+
+#include <array>
+#include <cmath>
+
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
@@ -99,25 +101,30 @@ void PairLeptonSphereOMP::eval(int iifrom, int iito, ThrData *const thr)
 
   std::vector<Lepton::CompiledExpression> pairforce;
   std::vector<Lepton::CompiledExpression> pairpot;
-  std::vector<std::pair<bool, bool>> have_rad;
+  std::vector<std::array<bool, 3>> has_ref;
   try {
     for (const auto &expr : expressions) {
       auto parsed = Lepton::Parser::parse(LeptonUtils::substitute(expr, Pointers::lmp), functions);
       pairforce.emplace_back(parsed.differentiate("r").createCompiledExpression());
+      has_ref.push_back({true, true, true});
+      try {
+        pairforce.back().getVariableReference("r");
+      } catch (Lepton::Exception &) {
+        has_ref.back()[0] = false;
+      }
       if (EFLAG) pairpot.emplace_back(parsed.createCompiledExpression());
-      pairforce.back().getVariableReference("r");
-      have_rad.emplace_back(true, true);
 
-      // check if there are references to charges
+      // check if there are references to radii
+
       try {
         pairforce.back().getVariableReference("radi");
-      } catch (std::exception &) {
-        have_rad.back().first = false;
+      } catch (Lepton::Exception &) {
+        has_ref.back()[1] = false;
       }
       try {
         pairforce.back().getVariableReference("radj");
-      } catch (std::exception &) {
-        have_rad.back().second = false;
+      } catch (Lepton::Exception &) {
+        has_ref.back()[2] = false;
       }
     }
   } catch (std::exception &e) {
@@ -150,9 +157,9 @@ void PairLeptonSphereOMP::eval(int iifrom, int iito, ThrData *const thr)
       if (rsq < cutsq[itype][jtype]) {
         const double r = sqrt(rsq);
         const int idx = type2expression[itype][jtype];
-        pairforce[idx].getVariableReference("r") = r;
-        if (have_rad[idx].first) pairforce[idx].getVariableReference("radi") = radius[i];
-        if (have_rad[idx].second) pairforce[idx].getVariableReference("radj") = radius[j];
+        if (has_ref[idx][0]) pairforce[idx].getVariableReference("r") = r;
+        if (has_ref[idx][1]) pairforce[idx].getVariableReference("radi") = radius[i];
+        if (has_ref[idx][2]) pairforce[idx].getVariableReference("radj") = radius[j];
         const double fpair = -pairforce[idx].evaluate() / r * factor_lj;
 
         fxtmp += delx * fpair;
@@ -166,9 +173,14 @@ void PairLeptonSphereOMP::eval(int iifrom, int iito, ThrData *const thr)
 
         double evdwl = 0.0;
         if (EFLAG) {
-          pairpot[idx].getVariableReference("r") = r;
-          if (have_rad[idx].first) pairpot[idx].getVariableReference("radi") = radius[i];
-          if (have_rad[idx].second) pairpot[idx].getVariableReference("radj") = radius[j];
+          try {
+            pairpot[idx].getVariableReference("r") = r;
+          } catch (Lepton::Exception &) {
+            ;    // ignore -> constant potential
+          }
+          if (has_ref[idx][1]) pairpot[idx].getVariableReference("radi") = radius[i];
+          if (has_ref[idx][2]) pairpot[idx].getVariableReference("radj") = radius[j];
+
           evdwl = pairpot[idx].evaluate();
           evdwl *= factor_lj;
         }
diff --git a/src/POEMS/fix_poems.cpp b/src/POEMS/fix_poems.cpp
index f289a939e6..55199a7191 100644
--- a/src/POEMS/fix_poems.cpp
+++ b/src/POEMS/fix_poems.cpp
@@ -855,7 +855,7 @@ void FixPOEMS::pre_neighbor() {}
    count # of degrees-of-freedom removed by fix_poems for atoms in igroup
 ------------------------------------------------------------------------- */
 
-int FixPOEMS::dof(int igroup)
+bigint FixPOEMS::dof(int igroup)
 {
   int groupbit = group->bitmask[igroup];
 
@@ -877,17 +877,17 @@ int FixPOEMS::dof(int igroup)
 
   // remove 3N - 6 dof for each rigid body if at least 2 atoms are in igroup
 
-  int n = 0;
+  bigint n = 0;
   for (int ibody = 0; ibody < nbody; ibody++)
     if (nall[ibody] > 2) n += 3 * nall[ibody] - 6;
 
   // subtract 3 additional dof for each joint if atom is also in igroup
 
-  int m = 0;
+  bigint m = 0;
   for (int i = 0; i < nlocal; i++)
     if (natom2body[i] > 1 && (mask[i] & groupbit)) m += 3 * (natom2body[i] - 1);
-  int mall;
-  MPI_Allreduce(&m, &mall, 1, MPI_INT, MPI_SUM, world);
+  bigint mall;
+  MPI_Allreduce(&m, &mall, 1, MPI_LMP_BIGINT, MPI_SUM, world);
   n += mall;
 
   // delete local memory
diff --git a/src/POEMS/fix_poems.h b/src/POEMS/fix_poems.h
index 99af171636..6aac4abd8a 100644
--- a/src/POEMS/fix_poems.h
+++ b/src/POEMS/fix_poems.h
@@ -47,7 +47,7 @@ class FixPOEMS : public Fix {
   double memory_usage() override;
 
   void pre_neighbor() override;
-  int dof(int) override;
+  bigint dof(int) override;
   void deform(int) override;
   int modify_param(int, char **) override;
   void reset_dt() override;
diff --git a/src/QEQ/fix_qeq.cpp b/src/QEQ/fix_qeq.cpp
index b60438b7c8..22632cf786 100644
--- a/src/QEQ/fix_qeq.cpp
+++ b/src/QEQ/fix_qeq.cpp
@@ -338,12 +338,6 @@ void FixQEq::setup_pre_force(int vflag)
   if (force->newton_pair == 0)
     error->all(FLERR,"QEQ with 'newton pair off' not supported");
 
-  if (force->pair) {
-    if (force->pair->suffix_flag & (Suffix::INTEL|Suffix::GPU))
-      error->all(FLERR,"QEQ is not compatiple with suffix version "
-                 "of pair style");
-  }
-
   deallocate_storage();
   allocate_storage();
 
diff --git a/src/REACTION/README b/src/REACTION/README
index 99a5d604ec..b9199d6d47 100644
--- a/src/REACTION/README
+++ b/src/REACTION/README
@@ -25,4 +25,5 @@ The REACTER methodology is detailed in:
     https://doi.org/10.1021/acs.macromol.0c02012
 
 This package was created by Jacob Gissinger
-(jacob.r.gissinger@gmail.com) at the NASA Langley Research Center.
+(jgissing@stevens.edu) while at the NASA Langley Research Center
+and Stevens Institute of Technology.
diff --git a/src/REACTION/fix_bond_react.cpp b/src/REACTION/fix_bond_react.cpp
index d124b06dc2..786f5bfe6e 100644
--- a/src/REACTION/fix_bond_react.cpp
+++ b/src/REACTION/fix_bond_react.cpp
@@ -13,7 +13,7 @@ See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-Contributing Author: Jacob Gissinger (jacob.r.gissinger@gmail.com)
+Contributing Author: Jacob Gissinger (jgissing@stevens.edu)
 ------------------------------------------------------------------------- */
 
 #include "fix_bond_react.h"
@@ -670,15 +670,6 @@ FixBondReact::~FixBondReact()
   memory->destroy(ghostly_rxn_count);
   memory->destroy(reaction_count_total);
 
-  if (newton_bond == 0) {
-    memory->destroy(xspecial);
-    memory->destroy(nxspecial);
-    memory->destroy(onemol_xspecial);
-    memory->destroy(onemol_nxspecial);
-    memory->destroy(twomol_xspecial);
-    memory->destroy(twomol_nxspecial);
-  }
-
   if (attempted_rxn == 1) {
     memory->destroy(restore_pt);
     memory->destroy(restore);
@@ -827,11 +818,10 @@ void FixBondReact::init()
     nlevels_respa = (dynamic_cast<Respa *>(update->integrate))->nlevels;
 
   // check cutoff for iatomtype,jatomtype
-  for (int i = 0; i < nreacts; i++) {
-    if (!utils::strmatch(force->pair_style,"^hybrid"))
-      if (force->pair == nullptr || cutsq[i][1] > force->pair->cutsq[iatomtype[i]][jatomtype[i]])
+  if (!utils::strmatch(force->pair_style,"^hybrid"))
+    for (int i = 0; i < nreacts; i++)
+      if (force->pair == nullptr || (closeneigh[i] < 0 && cutsq[i][1] > force->pair->cutsq[iatomtype[i]][jatomtype[i]]))
         error->all(FLERR,"Fix bond/react: Fix bond/react cutoff is longer than pairwise cutoff");
-  }
 
   // need a half neighbor list, built every Nevery steps
   neighbor->add_request(this, NeighConst::REQ_OCCASIONAL);
@@ -931,29 +921,10 @@ void FixBondReact::post_integrate()
 
   neighbor->build_one(list,1);
 
-  // here we define a full special list, independent of Newton setting
-  if (newton_bond == 1) {
-    nxspecial = atom->nspecial;
-    xspecial = atom->special;
-  } else {
-    int nall = atom->nlocal + atom->nghost;
-    memory->destroy(nxspecial);
-    memory->destroy(xspecial);
-    memory->create(nxspecial,nall,3,"bond/react:nxspecial");
-    memory->create(xspecial,nall,atom->maxspecial,"bond/react:xspecial");
-    for (int i = 0; i < atom->nlocal; i++) {
-      nxspecial[i][0] = atom->num_bond[i];
-      for (int j = 0; j < nxspecial[i][0]; j++) {
-        xspecial[i][j] = atom->bond_atom[i][j];
-      }
-      nxspecial[i][1] = atom->nspecial[i][1];
-      nxspecial[i][2] = atom->nspecial[i][2];
-      int joffset = nxspecial[i][0] - atom->nspecial[i][0];
-      for (int j = nxspecial[i][0]; j < nxspecial[i][2]; j++) {
-        xspecial[i][j+joffset] = atom->special[i][j];
-      }
-    }
-  }
+  // here we define a full special list
+  // may need correction for unusual special bond settings
+  nxspecial = atom->nspecial;
+  xspecial = atom->special;
 
   int j;
   for (rxnID = 0; rxnID < nreacts; rxnID++) {
@@ -2541,49 +2512,15 @@ int FixBondReact::get_chirality(double four_coords[12])
 
 /* ----------------------------------------------------------------------
   Get xspecials for current molecule templates
+  may need correction when specials defined explicitly in molecule templates
 ------------------------------------------------------------------------- */
 
 void FixBondReact::get_molxspecials()
 {
-  if (newton_bond == 1) {
-    onemol_nxspecial = onemol->nspecial;
-    onemol_xspecial = onemol->special;
-    twomol_nxspecial = twomol->nspecial;
-    twomol_xspecial = twomol->special;
-  } else {
-    memory->destroy(onemol_nxspecial);
-    memory->destroy(onemol_xspecial);
-    memory->create(onemol_nxspecial,onemol->natoms,3,"bond/react:onemol_nxspecial");
-    memory->create(onemol_xspecial,onemol->natoms,atom->maxspecial,"bond/react:onemol_xspecial");
-    for (int i = 0; i < onemol->natoms; i++) {
-      onemol_nxspecial[i][0] = onemol->num_bond[i];
-      for (int j = 0; j < onemol_nxspecial[i][0]; j++) {
-        onemol_xspecial[i][j] = onemol->bond_atom[i][j];
-      }
-      onemol_nxspecial[i][1] = onemol->nspecial[i][1];
-      onemol_nxspecial[i][2] = onemol->nspecial[i][2];
-      int joffset = onemol_nxspecial[i][0] - onemol->nspecial[i][0];
-      for (int j = onemol_nxspecial[i][0]; j < onemol_nxspecial[i][2]; j++) {
-        onemol_xspecial[i][j+joffset] = onemol->special[i][j];
-      }
-    }
-    memory->destroy(twomol_nxspecial);
-    memory->destroy(twomol_xspecial);
-    memory->create(twomol_nxspecial,twomol->natoms,3,"bond/react:twomol_nxspecial");
-    memory->create(twomol_xspecial,twomol->natoms,atom->maxspecial,"bond/react:twomol_xspecial");
-    for (int i = 0; i < twomol->natoms; i++) {
-      twomol_nxspecial[i][0] = twomol->num_bond[i];
-      for (int j = 0; j < twomol_nxspecial[i][0]; j++) {
-        twomol_xspecial[i][j] = twomol->bond_atom[i][j];
-      }
-      twomol_nxspecial[i][1] = twomol->nspecial[i][1];
-      twomol_nxspecial[i][2] = twomol->nspecial[i][2];
-      int joffset = twomol_nxspecial[i][0] - twomol->nspecial[i][0];
-      for (int j = twomol_nxspecial[i][0]; j < twomol_nxspecial[i][2]; j++) {
-        twomol_xspecial[i][j+joffset] = twomol->special[i][j];
-      }
-    }
-  }
+  onemol_nxspecial = onemol->nspecial;
+  onemol_xspecial = onemol->special;
+  twomol_nxspecial = twomol->nspecial;
+  twomol_xspecial = twomol->special;
 }
 
 /* ----------------------------------------------------------------------
@@ -2682,16 +2619,43 @@ void FixBondReact::find_landlocked_atoms(int myrxn)
   }
 
   // also, if atoms change number of bonds, but aren't landlocked, that could be bad
+  int warnflag = 0;
   if (comm->me == 0)
     for (int i = 0; i < twomol->natoms; i++) {
       if ((create_atoms[i][myrxn] == 0) &&
           (twomol_nxspecial[i][0] != onemol_nxspecial[equivalences[i][1][myrxn]-1][0]) &&
-          (landlocked_atoms[i][myrxn] == 0))
-        error->warning(FLERR, "Fix bond/react: Atom affected by reaction {} is too close "
-                       "to template edge",rxn_name[myrxn]);
-          break;
+          (landlocked_atoms[i][myrxn] == 0)) {
+        warnflag = 1;
+        break;
+      }
     }
 
+  // also, if an atom changes any of its bonds, but is not landlocked, that could be bad
+  int thereflag;
+  if (comm->me == 0)
+    for (int i = 0; i < twomol->natoms; i++) {
+      if (landlocked_atoms[i][myrxn] == 1) continue;
+      for (int j = 0; j < twomol_nxspecial[i][0]; j++) {
+        int oneneighID = equivalences[twomol_xspecial[i][j]-1][1][myrxn];
+        int ii = equivalences[i][1][myrxn] - 1;
+        thereflag = 0;
+        for (int k = 0; k < onemol_nxspecial[ii][0]; k++) {
+          if (oneneighID == onemol_xspecial[ii][k]) {
+            thereflag = 1;
+            break;
+          }
+        }
+        if (thereflag == 0) {
+          warnflag = 1;
+          break;
+        }
+      }
+      if (warnflag == 1) break;
+    }
+
+  if (comm->me == 0 && warnflag == 1) error->warning(FLERR, "Fix bond/react: Atom affected "
+                       "by reaction {} is too close to template edge",rxn_name[myrxn]);
+
   // finally, if a created atom is not landlocked, bad!
   for (int i = 0; i < twomol->natoms; i++) {
     if (create_atoms[i][myrxn] == 1 && landlocked_atoms[i][myrxn] == 0) {
@@ -3349,7 +3313,7 @@ void FixBondReact::update_everything()
         dynamic_cast<FixBondHistory *>(ihistory)->clear_cache();
 
     // Angles! First let's delete all angle info:
-    if (force->angle && twomol->angleflag) {
+    if (force->angle) {
       int *num_angle = atom->num_angle;
       int **angle_type = atom->angle_type;
       tagint **angle_atom1 = atom->angle_atom1;
@@ -3390,33 +3354,35 @@ void FixBondReact::update_everything()
           }
         }
         // now let's add the new angle info.
-        for (int j = 0; j < twomol->natoms; j++) {
-          int jj = equivalences[j][1][rxnID]-1;
-          if (atom->map(update_mega_glove[jj+1][i]) < nlocal && atom->map(update_mega_glove[jj+1][i]) >= 0) {
-            if (landlocked_atoms[j][rxnID] == 1) {
-              num_angle[atom->map(update_mega_glove[jj+1][i])] = twomol->num_angle[j];
-              delta_angle += twomol->num_angle[j];
-              for (int p = 0; p < twomol->num_angle[j]; p++) {
-                angle_type[atom->map(update_mega_glove[jj+1][i])][p] = twomol->angle_type[j][p];
-                angle_atom1[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->angle_atom1[j][p]-1][1][rxnID]][i];
-                angle_atom2[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->angle_atom2[j][p]-1][1][rxnID]][i];
-                angle_atom3[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->angle_atom3[j][p]-1][1][rxnID]][i];
+        if (twomol->angleflag) {
+          for (int j = 0; j < twomol->natoms; j++) {
+            int jj = equivalences[j][1][rxnID]-1;
+            if (atom->map(update_mega_glove[jj+1][i]) < nlocal && atom->map(update_mega_glove[jj+1][i]) >= 0) {
+              if (landlocked_atoms[j][rxnID] == 1) {
+                num_angle[atom->map(update_mega_glove[jj+1][i])] = twomol->num_angle[j];
+                delta_angle += twomol->num_angle[j];
+                for (int p = 0; p < twomol->num_angle[j]; p++) {
+                  angle_type[atom->map(update_mega_glove[jj+1][i])][p] = twomol->angle_type[j][p];
+                  angle_atom1[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->angle_atom1[j][p]-1][1][rxnID]][i];
+                  angle_atom2[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->angle_atom2[j][p]-1][1][rxnID]][i];
+                  angle_atom3[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->angle_atom3[j][p]-1][1][rxnID]][i];
+                }
               }
-            }
-            if (landlocked_atoms[j][rxnID] == 0) {
-              for (int p = 0; p < twomol->num_angle[j]; p++) {
-                if (landlocked_atoms[twomol->angle_atom1[j][p]-1][rxnID] == 1 ||
-                    landlocked_atoms[twomol->angle_atom2[j][p]-1][rxnID] == 1 ||
-                    landlocked_atoms[twomol->angle_atom3[j][p]-1][rxnID] == 1) {
-                  insert_num = num_angle[atom->map(update_mega_glove[jj+1][i])];
-                  angle_type[atom->map(update_mega_glove[jj+1][i])][insert_num] = twomol->angle_type[j][p];
-                  angle_atom1[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->angle_atom1[j][p]-1][1][rxnID]][i];
-                  angle_atom2[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->angle_atom2[j][p]-1][1][rxnID]][i];
-                  angle_atom3[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->angle_atom3[j][p]-1][1][rxnID]][i];
-                  num_angle[atom->map(update_mega_glove[jj+1][i])]++;
-                  if (num_angle[atom->map(update_mega_glove[jj+1][i])] > atom->angle_per_atom)
-                    error->one(FLERR,"Fix bond/react topology/atom exceed system topology/atom");
-                  delta_angle++;
+              if (landlocked_atoms[j][rxnID] == 0) {
+                for (int p = 0; p < twomol->num_angle[j]; p++) {
+                  if (landlocked_atoms[twomol->angle_atom1[j][p]-1][rxnID] == 1 ||
+                      landlocked_atoms[twomol->angle_atom2[j][p]-1][rxnID] == 1 ||
+                      landlocked_atoms[twomol->angle_atom3[j][p]-1][rxnID] == 1) {
+                    insert_num = num_angle[atom->map(update_mega_glove[jj+1][i])];
+                    angle_type[atom->map(update_mega_glove[jj+1][i])][insert_num] = twomol->angle_type[j][p];
+                    angle_atom1[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->angle_atom1[j][p]-1][1][rxnID]][i];
+                    angle_atom2[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->angle_atom2[j][p]-1][1][rxnID]][i];
+                    angle_atom3[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->angle_atom3[j][p]-1][1][rxnID]][i];
+                    num_angle[atom->map(update_mega_glove[jj+1][i])]++;
+                    if (num_angle[atom->map(update_mega_glove[jj+1][i])] > atom->angle_per_atom)
+                      error->one(FLERR,"Fix bond/react topology/atom exceed system topology/atom");
+                    delta_angle++;
+                  }
                 }
               }
             }
@@ -3426,7 +3392,7 @@ void FixBondReact::update_everything()
     }
 
     // Dihedrals! first let's delete all dihedral info for landlocked atoms
-    if (force->dihedral && twomol->dihedralflag) {
+    if (force->dihedral) {
       int *num_dihedral = atom->num_dihedral;
       int **dihedral_type = atom->dihedral_type;
       tagint **dihedral_atom1 = atom->dihedral_atom1;
@@ -3470,36 +3436,38 @@ void FixBondReact::update_everything()
           }
         }
         // now let's add new dihedral info
-        for (int j = 0; j < twomol->natoms; j++) {
-          int jj = equivalences[j][1][rxnID]-1;
-          if (atom->map(update_mega_glove[jj+1][i]) < nlocal && atom->map(update_mega_glove[jj+1][i]) >= 0) {
-            if (landlocked_atoms[j][rxnID] == 1) {
-              num_dihedral[atom->map(update_mega_glove[jj+1][i])] = twomol->num_dihedral[j];
-              delta_dihed += twomol->num_dihedral[j];
-              for (int p = 0; p < twomol->num_dihedral[j]; p++) {
-                dihedral_type[atom->map(update_mega_glove[jj+1][i])][p] = twomol->dihedral_type[j][p];
-                dihedral_atom1[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->dihedral_atom1[j][p]-1][1][rxnID]][i];
-                dihedral_atom2[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->dihedral_atom2[j][p]-1][1][rxnID]][i];
-                dihedral_atom3[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->dihedral_atom3[j][p]-1][1][rxnID]][i];
-                dihedral_atom4[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->dihedral_atom4[j][p]-1][1][rxnID]][i];
+        if (twomol->dihedralflag) {
+          for (int j = 0; j < twomol->natoms; j++) {
+            int jj = equivalences[j][1][rxnID]-1;
+            if (atom->map(update_mega_glove[jj+1][i]) < nlocal && atom->map(update_mega_glove[jj+1][i]) >= 0) {
+              if (landlocked_atoms[j][rxnID] == 1) {
+                num_dihedral[atom->map(update_mega_glove[jj+1][i])] = twomol->num_dihedral[j];
+                delta_dihed += twomol->num_dihedral[j];
+                for (int p = 0; p < twomol->num_dihedral[j]; p++) {
+                  dihedral_type[atom->map(update_mega_glove[jj+1][i])][p] = twomol->dihedral_type[j][p];
+                  dihedral_atom1[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->dihedral_atom1[j][p]-1][1][rxnID]][i];
+                  dihedral_atom2[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->dihedral_atom2[j][p]-1][1][rxnID]][i];
+                  dihedral_atom3[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->dihedral_atom3[j][p]-1][1][rxnID]][i];
+                  dihedral_atom4[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->dihedral_atom4[j][p]-1][1][rxnID]][i];
+                }
               }
-            }
-            if (landlocked_atoms[j][rxnID] == 0) {
-              for (int p = 0; p < twomol->num_dihedral[j]; p++) {
-                if (landlocked_atoms[twomol->dihedral_atom1[j][p]-1][rxnID] == 1 ||
-                    landlocked_atoms[twomol->dihedral_atom2[j][p]-1][rxnID] == 1 ||
-                    landlocked_atoms[twomol->dihedral_atom3[j][p]-1][rxnID] == 1 ||
-                    landlocked_atoms[twomol->dihedral_atom4[j][p]-1][rxnID] == 1) {
-                  insert_num = num_dihedral[atom->map(update_mega_glove[jj+1][i])];
-                  dihedral_type[atom->map(update_mega_glove[jj+1][i])][insert_num] = twomol->dihedral_type[j][p];
-                  dihedral_atom1[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->dihedral_atom1[j][p]-1][1][rxnID]][i];
-                  dihedral_atom2[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->dihedral_atom2[j][p]-1][1][rxnID]][i];
-                  dihedral_atom3[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->dihedral_atom3[j][p]-1][1][rxnID]][i];
-                  dihedral_atom4[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->dihedral_atom4[j][p]-1][1][rxnID]][i];
-                  num_dihedral[atom->map(update_mega_glove[jj+1][i])]++;
-                  if (num_dihedral[atom->map(update_mega_glove[jj+1][i])] > atom->dihedral_per_atom)
-                    error->one(FLERR,"Fix bond/react topology/atom exceed system topology/atom");
-                  delta_dihed++;
+              if (landlocked_atoms[j][rxnID] == 0) {
+                for (int p = 0; p < twomol->num_dihedral[j]; p++) {
+                  if (landlocked_atoms[twomol->dihedral_atom1[j][p]-1][rxnID] == 1 ||
+                      landlocked_atoms[twomol->dihedral_atom2[j][p]-1][rxnID] == 1 ||
+                      landlocked_atoms[twomol->dihedral_atom3[j][p]-1][rxnID] == 1 ||
+                      landlocked_atoms[twomol->dihedral_atom4[j][p]-1][rxnID] == 1) {
+                    insert_num = num_dihedral[atom->map(update_mega_glove[jj+1][i])];
+                    dihedral_type[atom->map(update_mega_glove[jj+1][i])][insert_num] = twomol->dihedral_type[j][p];
+                    dihedral_atom1[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->dihedral_atom1[j][p]-1][1][rxnID]][i];
+                    dihedral_atom2[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->dihedral_atom2[j][p]-1][1][rxnID]][i];
+                    dihedral_atom3[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->dihedral_atom3[j][p]-1][1][rxnID]][i];
+                    dihedral_atom4[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->dihedral_atom4[j][p]-1][1][rxnID]][i];
+                    num_dihedral[atom->map(update_mega_glove[jj+1][i])]++;
+                    if (num_dihedral[atom->map(update_mega_glove[jj+1][i])] > atom->dihedral_per_atom)
+                      error->one(FLERR,"Fix bond/react topology/atom exceed system topology/atom");
+                    delta_dihed++;
+                  }
                 }
               }
             }
@@ -3509,7 +3477,7 @@ void FixBondReact::update_everything()
     }
 
     // finally IMPROPERS!!!! first let's delete all improper info for landlocked atoms
-    if (force->improper && twomol->improperflag) {
+    if (force->improper) {
       int *num_improper = atom->num_improper;
       int **improper_type = atom->improper_type;
       tagint **improper_atom1 = atom->improper_atom1;
@@ -3553,36 +3521,38 @@ void FixBondReact::update_everything()
           }
         }
         // now let's add new improper info
-        for (int j = 0; j < twomol->natoms; j++) {
-          int jj = equivalences[j][1][rxnID]-1;
-          if (atom->map(update_mega_glove[jj+1][i]) < nlocal && atom->map(update_mega_glove[jj+1][i]) >= 0) {
-            if (landlocked_atoms[j][rxnID] == 1) {
-              num_improper[atom->map(update_mega_glove[jj+1][i])] = twomol->num_improper[j];
-              delta_imprp += twomol->num_improper[j];
-              for (int p = 0; p < twomol->num_improper[j]; p++) {
-                improper_type[atom->map(update_mega_glove[jj+1][i])][p] = twomol->improper_type[j][p];
-                improper_atom1[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->improper_atom1[j][p]-1][1][rxnID]][i];
-                improper_atom2[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->improper_atom2[j][p]-1][1][rxnID]][i];
-                improper_atom3[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->improper_atom3[j][p]-1][1][rxnID]][i];
-                improper_atom4[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->improper_atom4[j][p]-1][1][rxnID]][i];
+        if (twomol->improperflag) {
+          for (int j = 0; j < twomol->natoms; j++) {
+            int jj = equivalences[j][1][rxnID]-1;
+            if (atom->map(update_mega_glove[jj+1][i]) < nlocal && atom->map(update_mega_glove[jj+1][i]) >= 0) {
+              if (landlocked_atoms[j][rxnID] == 1) {
+                num_improper[atom->map(update_mega_glove[jj+1][i])] = twomol->num_improper[j];
+                delta_imprp += twomol->num_improper[j];
+                for (int p = 0; p < twomol->num_improper[j]; p++) {
+                  improper_type[atom->map(update_mega_glove[jj+1][i])][p] = twomol->improper_type[j][p];
+                  improper_atom1[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->improper_atom1[j][p]-1][1][rxnID]][i];
+                  improper_atom2[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->improper_atom2[j][p]-1][1][rxnID]][i];
+                  improper_atom3[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->improper_atom3[j][p]-1][1][rxnID]][i];
+                  improper_atom4[atom->map(update_mega_glove[jj+1][i])][p] = update_mega_glove[equivalences[twomol->improper_atom4[j][p]-1][1][rxnID]][i];
+                }
               }
-            }
-            if (landlocked_atoms[j][rxnID] == 0) {
-              for (int p = 0; p < twomol->num_improper[j]; p++) {
-                if (landlocked_atoms[twomol->improper_atom1[j][p]-1][rxnID] == 1 ||
-                    landlocked_atoms[twomol->improper_atom2[j][p]-1][rxnID] == 1 ||
-                    landlocked_atoms[twomol->improper_atom3[j][p]-1][rxnID] == 1 ||
-                    landlocked_atoms[twomol->improper_atom4[j][p]-1][rxnID] == 1) {
-                  insert_num = num_improper[atom->map(update_mega_glove[jj+1][i])];
-                  improper_type[atom->map(update_mega_glove[jj+1][i])][insert_num] = twomol->improper_type[j][p];
-                  improper_atom1[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->improper_atom1[j][p]-1][1][rxnID]][i];
-                  improper_atom2[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->improper_atom2[j][p]-1][1][rxnID]][i];
-                  improper_atom3[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->improper_atom3[j][p]-1][1][rxnID]][i];
-                  improper_atom4[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->improper_atom4[j][p]-1][1][rxnID]][i];
-                  num_improper[atom->map(update_mega_glove[jj+1][i])]++;
-                  if (num_improper[atom->map(update_mega_glove[jj+1][i])] > atom->improper_per_atom)
-                    error->one(FLERR,"Fix bond/react topology/atom exceed system topology/atom");
-                  delta_imprp++;
+              if (landlocked_atoms[j][rxnID] == 0) {
+                for (int p = 0; p < twomol->num_improper[j]; p++) {
+                  if (landlocked_atoms[twomol->improper_atom1[j][p]-1][rxnID] == 1 ||
+                      landlocked_atoms[twomol->improper_atom2[j][p]-1][rxnID] == 1 ||
+                      landlocked_atoms[twomol->improper_atom3[j][p]-1][rxnID] == 1 ||
+                      landlocked_atoms[twomol->improper_atom4[j][p]-1][rxnID] == 1) {
+                    insert_num = num_improper[atom->map(update_mega_glove[jj+1][i])];
+                    improper_type[atom->map(update_mega_glove[jj+1][i])][insert_num] = twomol->improper_type[j][p];
+                    improper_atom1[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->improper_atom1[j][p]-1][1][rxnID]][i];
+                    improper_atom2[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->improper_atom2[j][p]-1][1][rxnID]][i];
+                    improper_atom3[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->improper_atom3[j][p]-1][1][rxnID]][i];
+                    improper_atom4[atom->map(update_mega_glove[jj+1][i])][insert_num] = update_mega_glove[equivalences[twomol->improper_atom4[j][p]-1][1][rxnID]][i];
+                    num_improper[atom->map(update_mega_glove[jj+1][i])]++;
+                    if (num_improper[atom->map(update_mega_glove[jj+1][i])] > atom->improper_per_atom)
+                      error->one(FLERR,"Fix bond/react topology/atom exceed system topology/atom");
+                    delta_imprp++;
+                  }
                 }
               }
             }
@@ -3895,7 +3865,8 @@ int FixBondReact::insert_atoms(tagint **my_update_mega_glove, int iupdate)
         // guess a somewhat reasonable initial velocity based on reaction site
         // further control is possible using bond_react_MASTER_group
         // compute |velocity| corresponding to a given temperature t, using specific atom's mass
-        double vtnorm = sqrt(t / (force->mvv2e / (dimension * force->boltz)) / atom->mass[twomol->type[m]]);
+        double mymass = atom->rmass ? atom->rmass[n] : atom->mass[twomol->type[m]];
+        double vtnorm = sqrt(t / (force->mvv2e / (dimension * force->boltz)) / mymass);
         v[n][0] = random[rxnID]->uniform();
         v[n][1] = random[rxnID]->uniform();
         v[n][2] = random[rxnID]->uniform();
diff --git a/src/REACTION/fix_bond_react.h b/src/REACTION/fix_bond_react.h
index 534261e11d..8c9fc9dce4 100644
--- a/src/REACTION/fix_bond_react.h
+++ b/src/REACTION/fix_bond_react.h
@@ -12,7 +12,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing Author: Jacob Gissinger (jacob.r.gissinger@gmail.com)
+   Contributing Author: Jacob Gissinger (jgissing@stevens.edu)
 ------------------------------------------------------------------------- */
 
 #ifdef FIX_CLASS
@@ -139,7 +139,7 @@ class FixBondReact : public Fix {
   int avail_guesses;     // num of restore points available
   int *guess_branch;     // used when there is more than two choices when guessing
   int **restore_pt;      // contains info about restore points
-  tagint **restore;      // contaings info about restore points
+  tagint **restore;      // contains info about restore points
   int *pioneer_count;    // counts pioneers
 
   int **edge;                // atoms in molecule templates with incorrect valences
diff --git a/src/RIGID/fix_rigid.cpp b/src/RIGID/fix_rigid.cpp
index 628abb240e..bd3c53e3ec 100644
--- a/src/RIGID/fix_rigid.cpp
+++ b/src/RIGID/fix_rigid.cpp
@@ -1247,7 +1247,7 @@ void FixRigid::enforce2d()
    return total count of DOF
 ------------------------------------------------------------------------- */
 
-int FixRigid::dof(int tgroup)
+bigint FixRigid::dof(int tgroup)
 {
   // cannot count DOF correctly unless setup_bodies_static() has been called
 
@@ -1306,7 +1306,7 @@ int FixRigid::dof(int tgroup)
   // 3d body with any finite-size M should have 6 dof, remove (3N+6M) - 6
   // 2d body with any finite-size M should have 3 dof, remove (2N+3M) - 3
 
-  int n = 0;
+  bigint n = 0;
   nlinear = 0;
   if (domain->dimension == 3) {
     for (int ibody = 0; ibody < nbody; ibody++)
diff --git a/src/RIGID/fix_rigid.h b/src/RIGID/fix_rigid.h
index 361ddd2720..c2f04ecf1a 100644
--- a/src/RIGID/fix_rigid.h
+++ b/src/RIGID/fix_rigid.h
@@ -48,7 +48,7 @@ class FixRigid : public Fix {
 
   void setup_pre_neighbor() override;
   void pre_neighbor() override;
-  int dof(int) override;
+  bigint dof(int) override;
   void deform(int) override;
   void reset_dt() override;
   void zero_momentum() override;
diff --git a/src/RIGID/fix_rigid_small.cpp b/src/RIGID/fix_rigid_small.cpp
index bd49834f15..5905e44595 100644
--- a/src/RIGID/fix_rigid_small.cpp
+++ b/src/RIGID/fix_rigid_small.cpp
@@ -1123,7 +1123,7 @@ void FixRigidSmall::enforce2d()
    return total count of DOF
 ------------------------------------------------------------------------- */
 
-int FixRigidSmall::dof(int tgroup)
+bigint FixRigidSmall::dof(int tgroup)
 {
   int i,j;
 
@@ -1195,7 +1195,7 @@ int FixRigidSmall::dof(int tgroup)
 
   double *inertia;
 
-  int n = 0;
+  bigint n = 0;
   nlinear = 0;
   if (domain->dimension == 3) {
     for (int ibody = 0; ibody < nlocal_body; ibody++) {
@@ -1216,8 +1216,8 @@ int FixRigidSmall::dof(int tgroup)
 
   memory->destroy(counts);
 
-  int nall;
-  MPI_Allreduce(&n,&nall,1,MPI_INT,MPI_SUM,world);
+  bigint nall;
+  MPI_Allreduce(&n,&nall,1,MPI_LMP_BIGINT,MPI_SUM,world);
   return nall;
 }
 
diff --git a/src/RIGID/fix_rigid_small.h b/src/RIGID/fix_rigid_small.h
index 0070d976df..0508063f05 100644
--- a/src/RIGID/fix_rigid_small.h
+++ b/src/RIGID/fix_rigid_small.h
@@ -54,7 +54,7 @@ class FixRigidSmall : public Fix {
 
   void setup_pre_neighbor() override;
   void pre_neighbor() override;
-  int dof(int) override;
+  bigint dof(int) override;
   void deform(int) override;
   void reset_dt() override;
   void zero_momentum() override;
diff --git a/src/RIGID/fix_shake.cpp b/src/RIGID/fix_shake.cpp
index b2c65220bc..15bd5d207f 100644
--- a/src/RIGID/fix_shake.cpp
+++ b/src/RIGID/fix_shake.cpp
@@ -207,8 +207,8 @@ FixShake::FixShake(LAMMPS *lmp, int narg, char **arg) :
 
   if (output_every) {
     int nb = atom->nbondtypes + 1;
-    b_count = new int[nb];
-    b_count_all = new int[nb];
+    b_count = new bigint[nb];
+    b_count_all = new bigint[nb];
     b_ave = new double[nb];
     b_ave_all = new double[nb];
     b_max = new double[nb];
@@ -217,8 +217,8 @@ FixShake::FixShake(LAMMPS *lmp, int narg, char **arg) :
     b_min_all = new double[nb];
 
     int na = atom->nangletypes + 1;
-    a_count = new int[na];
-    a_count_all = new int[na];
+    a_count = new bigint[na];
+    a_count_all = new bigint[na];
     a_ave = new double[na];
     a_ave_all = new double[na];
     a_max = new double[na];
@@ -755,7 +755,7 @@ void FixShake::min_post_force(int vflag)
    count # of degrees-of-freedom removed by SHAKE for atoms in igroup
 ------------------------------------------------------------------------- */
 
-int FixShake::dof(int igroup)
+bigint FixShake::dof(int igroup)
 {
   int groupbit = group->bitmask[igroup];
 
@@ -766,7 +766,7 @@ int FixShake::dof(int igroup)
   // count dof in a cluster if and only if
   // the central atom is in group and atom i is the central atom
 
-  int n = 0;
+  bigint n = 0;
   for (int i = 0; i < nlocal; i++) {
     if (!(mask[i] & groupbit)) continue;
     if (shake_flag[i] == 0) continue;
@@ -777,8 +777,8 @@ int FixShake::dof(int igroup)
     else if (shake_flag[i] == 4) n += 3;
   }
 
-  int nall;
-  MPI_Allreduce(&n,&nall,1,MPI_INT,MPI_SUM,world);
+  bigint nall;
+  MPI_Allreduce(&n,&nall,1,MPI_LMP_BIGINT,MPI_SUM,world);
   return nall;
 }
 
@@ -1098,7 +1098,7 @@ void FixShake::find_clusters()
   // print info on SHAKE clusters
   // -----------------------------------------------------
 
-  int count1,count2,count3,count4;
+  bigint count1,count2,count3,count4;
   count1 = count2 = count3 = count4 = 0;
   for (i = 0; i < nlocal; i++) {
     if (shake_flag[i] == 1) count1++;
@@ -1107,15 +1107,15 @@ void FixShake::find_clusters()
     else if (shake_flag[i] == 4) count4++;
   }
 
-  int tmp;
+  bigint tmp;
   tmp = count1;
-  MPI_Allreduce(&tmp,&count1,1,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(&tmp,&count1,1,MPI_LMP_BIGINT,MPI_SUM,world);
   tmp = count2;
-  MPI_Allreduce(&tmp,&count2,1,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(&tmp,&count2,1,MPI_LMP_BIGINT,MPI_SUM,world);
   tmp = count3;
-  MPI_Allreduce(&tmp,&count3,1,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(&tmp,&count3,1,MPI_LMP_BIGINT,MPI_SUM,world);
   tmp = count4;
-  MPI_Allreduce(&tmp,&count4,1,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(&tmp,&count4,1,MPI_LMP_BIGINT,MPI_SUM,world);
 
   if (comm->me == 0) {
     utils::logmesg(lmp,"{:>8} = # of size 2 clusters\n"
@@ -2682,12 +2682,12 @@ void FixShake::stats()
 
   // sum across all procs
 
-  MPI_Allreduce(b_count,b_count_all,nb,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(b_count,b_count_all,nb,MPI_LMP_BIGINT,MPI_SUM,world);
   MPI_Allreduce(b_ave,b_ave_all,nb,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(b_max,b_max_all,nb,MPI_DOUBLE,MPI_MAX,world);
   MPI_Allreduce(b_min,b_min_all,nb,MPI_DOUBLE,MPI_MIN,world);
 
-  MPI_Allreduce(a_count,a_count_all,na,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(a_count,a_count_all,na,MPI_LMP_BIGINT,MPI_SUM,world);
   MPI_Allreduce(a_ave,a_ave_all,na,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(a_max,a_max_all,na,MPI_DOUBLE,MPI_MAX,world);
   MPI_Allreduce(a_min,a_min_all,na,MPI_DOUBLE,MPI_MIN,world);
diff --git a/src/RIGID/fix_shake.h b/src/RIGID/fix_shake.h
index 3b04560f09..d02fdd784a 100644
--- a/src/RIGID/fix_shake.h
+++ b/src/RIGID/fix_shake.h
@@ -59,7 +59,7 @@ class FixShake : public Fix {
   virtual void correct_coordinates(int vflag);
   virtual void correct_velocities();
 
-  int dof(int) override;
+  bigint dof(int) override;
   void reset_dt() override;
   void *extract(const char *, int &) override;
   double compute_scalar() override;
@@ -117,10 +117,10 @@ class FixShake : public Fix {
   int nlist, maxlist;    // size and max-size of list
 
   // stat quantities
-  int *b_count, *b_count_all;                   // counts for each bond type, atoms in bond cluster
+  bigint *b_count, *b_count_all;                // counts for each bond type, atoms in bond cluster
   double *b_ave, *b_max, *b_min;                // ave/max/min dist for each bond type
   double *b_ave_all, *b_max_all, *b_min_all;    // MPI summing arrays
-  int *a_count, *a_count_all;                   // ditto for angle types
+  bigint *a_count, *a_count_all;                // ditto for angle types
   double *a_ave, *a_max, *a_min;
   double *a_ave_all, *a_max_all, *a_min_all;
 
diff --git a/src/atom.cpp b/src/atom.cpp
index b604c54e6b..c08df16614 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -26,6 +26,7 @@
 #include "input.h"
 #include "label_map.h"
 #include "math_const.h"
+#include "math_extra.h"
 #include "memory.h"
 #include "modify.h"
 #include "molecule.h"
@@ -2112,6 +2113,15 @@ std::vector<Molecule *>Atom::get_molecule_by_id(const std::string &id)
 void Atom::add_molecule_atom(Molecule *onemol, int iatom, int ilocal, tagint offset)
 {
   if (onemol->qflag && q_flag) q[ilocal] = onemol->q[iatom];
+  if (onemol->muflag && mu_flag) {
+    double r[3], rotmat[3][3];
+    MathExtra::quat_to_mat(onemol->quat_external, rotmat);
+    MathExtra::matvec(rotmat, onemol->mu[iatom], r);
+    mu[ilocal][0] = r[0];
+    mu[ilocal][1] = r[1];
+    mu[ilocal][2] = r[2];
+    mu[ilocal][3] = sqrt(r[0] * r[0] + r[1] * r[1] + r[2] * r[2]);
+  }
   if (onemol->radiusflag && radius_flag) radius[ilocal] = onemol->radius[iatom];
   if (onemol->rmassflag && rmass_flag) rmass[ilocal] = onemol->rmass[iatom];
   else if (rmass_flag)
diff --git a/src/compute.cpp b/src/compute.cpp
index 2bd1544fd7..d47d1d5292 100644
--- a/src/compute.cpp
+++ b/src/compute.cpp
@@ -83,7 +83,7 @@ Compute::Compute(LAMMPS *lmp, int narg, char **arg) :
 
   extra_dof = domain->dimension;
   dynamic_user = 0;
-  fix_dof = 0;
+  fix_dof = 0.0;
 
   // setup list of timesteps
 
diff --git a/src/compute.h b/src/compute.h
index 8ae01a4469..6956c3ae99 100644
--- a/src/compute.h
+++ b/src/compute.h
@@ -178,7 +178,7 @@ class Compute : protected Pointers {
 
   double natoms_temp;    // # of atoms used for temperature calculation
   double extra_dof;      // extra DOF for temperature computes
-  int fix_dof;           // DOF due to fixes
+  double fix_dof;        // DOF due to fixes
   int dynamic;           // recount atoms for temperature computes
   int dynamic_user;      // user request for temp compute to be dynamic
 
diff --git a/src/compute_pair.cpp b/src/compute_pair.cpp
index e789adbc89..1cb22a006f 100644
--- a/src/compute_pair.cpp
+++ b/src/compute_pair.cpp
@@ -75,7 +75,7 @@ ComputePair::ComputePair(LAMMPS *lmp, int narg, char **arg) :
     pair = force->pair_match(pstyle, 1, nsub);
   }
 
-  if (!pair) error->all(FLERR, "Unrecognized pair style {} in compute pair command", pstyle);
+  if (!pair) error->all(FLERR, "Unused pair style {} in compute pair command", pstyle);
   npair = pair->nextra;
 
   if (npair) {
diff --git a/src/displace_atoms.cpp b/src/displace_atoms.cpp
index fa333f1bc2..5ecf5a2c9e 100644
--- a/src/displace_atoms.cpp
+++ b/src/displace_atoms.cpp
@@ -160,7 +160,7 @@ void DisplaceAtoms::command(int narg, char **arg)
     int *mask = atom->mask;
     int nlocal = atom->nlocal;
 
-    double fraction,dramp;
+    double fraction, dramp;
 
     for (i = 0; i < nlocal; i++) {
       if (mask[i] & groupbit) {
@@ -255,11 +255,12 @@ void DisplaceAtoms::command(int narg, char **arg)
     int line_flag = atom->line_flag;
     int tri_flag = atom->tri_flag;
     int body_flag = atom->body_flag;
+    int quat_atom_flag = atom->quat_flag;
 
     int theta_flag = 0;
     int quat_flag = 0;
     if (line_flag) theta_flag = 1;
-    if (ellipsoid_flag || tri_flag || body_flag) quat_flag = 1;
+    if (ellipsoid_flag || tri_flag || body_flag || quat_atom_flag) quat_flag = 1;
 
     // AtomVec pointers to retrieve per-atom storage of extra quantities
 
@@ -269,6 +270,7 @@ void DisplaceAtoms::command(int narg, char **arg)
     auto avec_body = dynamic_cast<AtomVecBody *>(atom->style_match("body"));
 
     double **x = atom->x;
+    double **quat_atom = atom->quat;
     int *ellipsoid = atom->ellipsoid;
     int *line = atom->line;
     int *tri = atom->tri;
@@ -313,7 +315,7 @@ void DisplaceAtoms::command(int narg, char **arg)
 
         // quats for ellipsoids, tris, and bodies
 
-        if (quat_flag) {
+        if (quat_flag && !quat_atom_flag) {
           quat = nullptr;
           if (ellipsoid_flag && ellipsoid[i] >= 0)
             quat = avec_ellipsoid->bonus[ellipsoid[i]].quat;
@@ -322,12 +324,18 @@ void DisplaceAtoms::command(int narg, char **arg)
           else if (body_flag && body[i] >= 0)
             quat = avec_body->bonus[body[i]].quat;
           if (quat) {
-            MathExtra::quatquat(qrotate,quat,qnew);
+            MathExtra::quatquat(qrotate, quat, qnew);
             quat[0] = qnew[0];
             quat[1] = qnew[1];
             quat[2] = qnew[2];
             quat[3] = qnew[3];
           }
+        } else if (quat_atom_flag) {
+          MathExtra::quatquat(qrotate, quat_atom[i], qnew);
+          quat_atom[i][0] = qnew[0];
+          quat_atom[i][1] = qnew[1];
+          quat_atom[i][2] = qnew[2];
+          quat_atom[i][3] = qnew[3];
         }
       }
     }
diff --git a/src/fix.h b/src/fix.h
index 9b595f0c60..ca0a1ef84b 100644
--- a/src/fix.h
+++ b/src/fix.h
@@ -99,8 +99,8 @@ class Fix : protected Pointers {
   int size_local_cols;    // 0 = vector, N = columns in local array
   int local_freq;         // frequency local data is available at
 
-  int pergrid_flag;       // 0/1 if per-grid data is stored
-  int pergrid_freq;       // frequency per-grid data is available at
+  int pergrid_flag;    // 0/1 if per-grid data is stored
+  int pergrid_freq;    // frequency per-grid data is available at
 
   int extscalar;    // 0/1 if global scalar is intensive/extensive
   int extvector;    // 0/1/-1 if global vector is all int/ext/extlist
@@ -129,11 +129,11 @@ class Fix : protected Pointers {
 
   // KOKKOS flags and variables
 
-  int kokkosable;             // 1 if Kokkos fix
-  int forward_comm_device;    // 1 if forward comm on Device
-  int exchange_comm_device;   // 1 if exchange comm on Device
-  int fuse_integrate_flag;    // 1 if can fuse initial integrate with final integrate
-  int sort_device;            // 1 if sort on Device
+  int kokkosable;              // 1 if Kokkos fix
+  int forward_comm_device;     // 1 if forward comm on Device
+  int exchange_comm_device;    // 1 if exchange comm on Device
+  int fuse_integrate_flag;     // 1 if can fuse initial integrate with final integrate
+  int sort_device;             // 1 if sort on Device
   ExecutionSpace execution_space;
   unsigned int datamask_read, datamask_modify;
 
@@ -223,7 +223,7 @@ class Fix : protected Pointers {
   virtual void unpack_reverse_grid(int, void *, int, int *){};
   virtual void pack_remap_grid(int, void *, int, int *){};
   virtual void unpack_remap_grid(int, void *, int, int *){};
-  virtual int unpack_read_grid(int, char *) {return 0;};
+  virtual int unpack_read_grid(int, char *) { return 0; };
   virtual void pack_write_grid(int, void *){};
   virtual void unpack_write_grid(int, void *, int *){};
 
@@ -236,7 +236,7 @@ class Fix : protected Pointers {
   virtual double compute_vector(int) { return 0.0; }
   virtual double compute_array(int, int) { return 0.0; }
 
-  virtual int dof(int) { return 0; }
+  virtual bigint dof(int) { return 0; }
   virtual void deform(int) {}
   virtual void reset_target(double) {}
   virtual void reset_dt() {}
diff --git a/src/fix_move.cpp b/src/fix_move.cpp
index 36bba410fc..53009495b1 100644
--- a/src/fix_move.cpp
+++ b/src/fix_move.cpp
@@ -276,10 +276,11 @@ FixMove::FixMove(LAMMPS *lmp, int narg, char **arg) :
   line_flag = atom->line_flag;
   tri_flag = atom->tri_flag;
   body_flag = atom->body_flag;
+  quat_atom_flag = atom->quat_flag;
 
   theta_flag = quat_flag = 0;
   if (line_flag) theta_flag = 1;
-  if (ellipsoid_flag || tri_flag || body_flag) quat_flag = 1;
+  if (ellipsoid_flag || tri_flag || body_flag || quat_atom_flag) quat_flag = 1;
 
   extra_flag = 0;
   if (omega_flag || angmom_flag || theta_flag || quat_flag) extra_flag = 1;
@@ -329,7 +330,7 @@ FixMove::FixMove(LAMMPS *lmp, int narg, char **arg) :
     }
   }
 
-  if (quat_flag) {
+  if (quat_flag && !quat_atom_flag) {
     double *quat;
     for (int i = 0; i < nlocal; i++) {
       quat = nullptr;
@@ -349,6 +350,16 @@ FixMove::FixMove(LAMMPS *lmp, int narg, char **arg) :
       } else
         qoriginal[i][0] = qoriginal[i][1] = qoriginal[i][2] = qoriginal[i][3] = 0.0;
     }
+  } else if (quat_atom_flag) {
+    double **quat_atom = atom->quat;
+    for (int i = 0; i < nlocal; i++) {
+      if (mask[i] & groupbit) {
+        qoriginal[i][0] = quat_atom[i][0];
+        qoriginal[i][1] = quat_atom[i][1];
+        qoriginal[i][2] = quat_atom[i][2];
+        qoriginal[i][3] = quat_atom[i][3];
+      }
+    }
   }
 
   // nrestart = size of per-atom restart data
@@ -521,6 +532,7 @@ void FixMove::initial_integrate(int /*vflag*/)
   double *radius = atom->radius;
   double *rmass = atom->rmass;
   double *mass = atom->mass;
+  double **quat_atom = atom->quat;
   int *type = atom->type;
   int *ellipsoid = atom->ellipsoid;
   int *line = atom->line;
@@ -749,9 +761,9 @@ void FixMove::initial_integrate(int /*vflag*/)
             avec_line->bonus[atom->line[i]].theta = theta_new;
           }
 
-          // quats for ellipsoids, tris, and bodies
+          // quats for ellipsoids, tris, bodies, and bpm/sphere
 
-          if (quat_flag) {
+          if (quat_flag && !quat_atom_flag) {
             quat = nullptr;
             if (ellipsoid_flag && ellipsoid[i] >= 0)
               quat = avec_ellipsoid->bonus[ellipsoid[i]].quat;
@@ -760,6 +772,8 @@ void FixMove::initial_integrate(int /*vflag*/)
             else if (body_flag && body[i] >= 0)
               quat = avec_body->bonus[body[i]].quat;
             if (quat) MathExtra::quatquat(qrotate, qoriginal[i], quat);
+          } else if (quat_atom_flag) {
+            MathExtra::quatquat(qrotate, qoriginal[i], quat_atom[i]);
           }
         }
 
@@ -880,9 +894,9 @@ void FixMove::initial_integrate(int /*vflag*/)
             avec_line->bonus[atom->line[i]].theta = theta_new;
           }
 
-          // quats for ellipsoids, tris, and bodies
+          // quats for ellipsoids, tris, bodies, and bpm/sphere
 
-          if (quat_flag) {
+          if (quat_flag && !quat_atom_flag) {
             quat = nullptr;
             if (ellipsoid_flag && ellipsoid[i] >= 0)
               quat = avec_ellipsoid->bonus[ellipsoid[i]].quat;
@@ -891,6 +905,8 @@ void FixMove::initial_integrate(int /*vflag*/)
             else if (body_flag && body[i] >= 0)
               quat = avec_body->bonus[body[i]].quat;
             if (quat) MathExtra::quatquat(qrotate, qoriginal[i], quat);
+          } else if (quat_atom_flag) {
+            MathExtra::quatquat(qrotate, qoriginal[i], quat_atom[i]);
           }
         }
 
@@ -1341,9 +1357,9 @@ void FixMove::set_arrays(int i)
         toriginal[i] = theta - 0.0;    // NOTE: edit this line
       }
 
-      // quats for ellipsoids, tris, and bodies
+      // quats for ellipsoids, tris, bodies, and bpm/sphere
 
-      if (quat_flag) {
+      if (quat_flag && !quat_atom_flag) {
         quat = nullptr;
         if (ellipsoid_flag && ellipsoid[i] >= 0)
           quat = avec_ellipsoid->bonus[ellipsoid[i]].quat;
@@ -1354,6 +1370,12 @@ void FixMove::set_arrays(int i)
         if (quat) {
           // qoriginal = f(quat,-delta);   // NOTE: edit this line
         }
+      } else if (quat_atom_flag) {
+        // double **quat_atom = atom->quat;
+        // qoriginal[i][0] = quat_atom[i][0]; // NOTE: edit this line
+        // qoriginal[i][1] = quat_atom[i][1]; // NOTE: edit this line
+        // qoriginal[i][2] = quat_atom[i][2]; // NOTE: edit this line
+        // qoriginal[i][3] = quat_atom[i][3]; // NOTE: edit this line
       }
     }
     xoriginal[i][0] -= vx * delta;
@@ -1400,7 +1422,7 @@ void FixMove::set_arrays(int i)
 
       // quats for ellipsoids, tris, and bodies
 
-      if (quat_flag) {
+      if (quat_flag && !quat_atom_flag) {
         quat = nullptr;
         if (ellipsoid_flag && ellipsoid[i] >= 0)
           quat = avec_ellipsoid->bonus[ellipsoid[i]].quat;
@@ -1411,6 +1433,12 @@ void FixMove::set_arrays(int i)
         if (quat) {
           // qoriginal = f(quat,-delta);   // NOTE: edit this line
         }
+      } else if (quat_atom_flag) {
+        // double **quat_atom = atom->quat;
+        // qoriginal[i][0] = quat_atom[i][0]; // NOTE: edit this line
+        // qoriginal[i][1] = quat_atom[i][1]; // NOTE: edit this line
+        // qoriginal[i][2] = quat_atom[i][2]; // NOTE: edit this line
+        // qoriginal[i][3] = quat_atom[i][3]; // NOTE: edit this line
       }
     }
   }
diff --git a/src/fix_move.h b/src/fix_move.h
index e3c018f54d..244a9d704a 100644
--- a/src/fix_move.h
+++ b/src/fix_move.h
@@ -61,7 +61,7 @@ class FixMove : public Fix {
   int xvar, yvar, zvar, vxvar, vyvar, vzvar;
   int xvarstyle, yvarstyle, zvarstyle, vxvarstyle, vyvarstyle, vzvarstyle;
   int extra_flag, omega_flag, angmom_flag;
-  int radius_flag, ellipsoid_flag, line_flag, tri_flag, body_flag;
+  int radius_flag, ellipsoid_flag, line_flag, tri_flag, body_flag, quat_atom_flag;
   int theta_flag, quat_flag;
   int nlevels_respa, nrestart;
   int time_origin;
diff --git a/src/fmt/args.h b/src/fmt/args.h
index 2d684e7cc1..b77a2d0661 100644
--- a/src/fmt/args.h
+++ b/src/fmt/args.h
@@ -12,7 +12,7 @@
 #include <memory>      // std::unique_ptr
 #include <vector>
 
-#include "core.h"
+#include "format.h"  // std_string_view
 
 FMT_BEGIN_NAMESPACE
 
@@ -22,8 +22,9 @@ template <typename T> struct is_reference_wrapper : std::false_type {};
 template <typename T>
 struct is_reference_wrapper<std::reference_wrapper<T>> : std::true_type {};
 
-template <typename T> const T& unwrap(const T& v) { return v; }
-template <typename T> const T& unwrap(const std::reference_wrapper<T>& v) {
+template <typename T> auto unwrap(const T& v) -> const T& { return v; }
+template <typename T>
+auto unwrap(const std::reference_wrapper<T>& v) -> const T& {
   return static_cast<const T&>(v);
 }
 
@@ -50,7 +51,7 @@ class dynamic_arg_list {
   std::unique_ptr<node<>> head_;
 
  public:
-  template <typename T, typename Arg> const T& push(const Arg& arg) {
+  template <typename T, typename Arg> auto push(const Arg& arg) -> const T& {
     auto new_node = std::unique_ptr<typed_node<T>>(new typed_node<T>(arg));
     auto& value = new_node->value;
     new_node->next = std::move(head_);
@@ -110,14 +111,14 @@ class dynamic_format_arg_store
 
   friend class basic_format_args<Context>;
 
-  unsigned long long get_types() const {
+  auto get_types() const -> unsigned long long {
     return detail::is_unpacked_bit | data_.size() |
            (named_info_.empty()
                 ? 0ULL
                 : static_cast<unsigned long long>(detail::has_named_args_bit));
   }
 
-  const basic_format_arg<Context>* data() const {
+  auto data() const -> const basic_format_arg<Context>* {
     return named_info_.empty() ? data_.data() : data_.data() + 1;
   }
 
diff --git a/src/fmt/chrono.h b/src/fmt/chrono.h
index ff3e1445b9..9d54574e16 100644
--- a/src/fmt/chrono.h
+++ b/src/fmt/chrono.h
@@ -18,7 +18,7 @@
 #include <ostream>
 #include <type_traits>
 
-#include "format.h"
+#include "ostream.h"  // formatbuf
 
 FMT_BEGIN_NAMESPACE
 
@@ -72,7 +72,8 @@ template <typename To, typename From,
           FMT_ENABLE_IF(!std::is_same<From, To>::value &&
                         std::numeric_limits<From>::is_signed ==
                             std::numeric_limits<To>::is_signed)>
-FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
+FMT_CONSTEXPR auto lossless_integral_conversion(const From from, int& ec)
+    -> To {
   ec = 0;
   using F = std::numeric_limits<From>;
   using T = std::numeric_limits<To>;
@@ -101,7 +102,8 @@ template <typename To, typename From,
           FMT_ENABLE_IF(!std::is_same<From, To>::value &&
                         std::numeric_limits<From>::is_signed !=
                             std::numeric_limits<To>::is_signed)>
-FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
+FMT_CONSTEXPR auto lossless_integral_conversion(const From from, int& ec)
+    -> To {
   ec = 0;
   using F = std::numeric_limits<From>;
   using T = std::numeric_limits<To>;
@@ -133,7 +135,8 @@ FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
 
 template <typename To, typename From,
           FMT_ENABLE_IF(std::is_same<From, To>::value)>
-FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
+FMT_CONSTEXPR auto lossless_integral_conversion(const From from, int& ec)
+    -> To {
   ec = 0;
   return from;
 }  // function
@@ -154,7 +157,7 @@ FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
 // clang-format on
 template <typename To, typename From,
           FMT_ENABLE_IF(!std::is_same<From, To>::value)>
-FMT_CONSTEXPR To safe_float_conversion(const From from, int& ec) {
+FMT_CONSTEXPR auto safe_float_conversion(const From from, int& ec) -> To {
   ec = 0;
   using T = std::numeric_limits<To>;
   static_assert(std::is_floating_point<From>::value, "From must be floating");
@@ -176,7 +179,7 @@ FMT_CONSTEXPR To safe_float_conversion(const From from, int& ec) {
 
 template <typename To, typename From,
           FMT_ENABLE_IF(std::is_same<From, To>::value)>
-FMT_CONSTEXPR To safe_float_conversion(const From from, int& ec) {
+FMT_CONSTEXPR auto safe_float_conversion(const From from, int& ec) -> To {
   ec = 0;
   static_assert(std::is_floating_point<From>::value, "From must be floating");
   return from;
@@ -188,8 +191,8 @@ FMT_CONSTEXPR To safe_float_conversion(const From from, int& ec) {
 template <typename To, typename FromRep, typename FromPeriod,
           FMT_ENABLE_IF(std::is_integral<FromRep>::value),
           FMT_ENABLE_IF(std::is_integral<typename To::rep>::value)>
-To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
-                      int& ec) {
+auto safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
+                        int& ec) -> To {
   using From = std::chrono::duration<FromRep, FromPeriod>;
   ec = 0;
   // the basic idea is that we need to convert from count() in the from type
@@ -240,8 +243,8 @@ To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
 template <typename To, typename FromRep, typename FromPeriod,
           FMT_ENABLE_IF(std::is_floating_point<FromRep>::value),
           FMT_ENABLE_IF(std::is_floating_point<typename To::rep>::value)>
-To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
-                      int& ec) {
+auto safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
+                        int& ec) -> To {
   using From = std::chrono::duration<FromRep, FromPeriod>;
   ec = 0;
   if (std::isnan(from.count())) {
@@ -321,12 +324,12 @@ To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
 
 namespace detail {
 template <typename T = void> struct null {};
-inline null<> localtime_r FMT_NOMACRO(...) { return null<>(); }
-inline null<> localtime_s(...) { return null<>(); }
-inline null<> gmtime_r(...) { return null<>(); }
-inline null<> gmtime_s(...) { return null<>(); }
+inline auto localtime_r FMT_NOMACRO(...) -> null<> { return null<>(); }
+inline auto localtime_s(...) -> null<> { return null<>(); }
+inline auto gmtime_r(...) -> null<> { return null<>(); }
+inline auto gmtime_s(...) -> null<> { return null<>(); }
 
-inline const std::locale& get_classic_locale() {
+inline auto get_classic_locale() -> const std::locale& {
   static const auto& locale = std::locale::classic();
   return locale;
 }
@@ -336,8 +339,6 @@ template <typename CodeUnit> struct codecvt_result {
   CodeUnit buf[max_size];
   CodeUnit* end;
 };
-template <typename CodeUnit>
-constexpr const size_t codecvt_result<CodeUnit>::max_size;
 
 template <typename CodeUnit>
 void write_codecvt(codecvt_result<CodeUnit>& out, string_view in_buf,
@@ -408,8 +409,7 @@ inline void do_write(buffer<Char>& buf, const std::tm& time,
   auto&& format_buf = formatbuf<std::basic_streambuf<Char>>(buf);
   auto&& os = std::basic_ostream<Char>(&format_buf);
   os.imbue(loc);
-  using iterator = std::ostreambuf_iterator<Char>;
-  const auto& facet = std::use_facet<std::time_put<Char, iterator>>(loc);
+  const auto& facet = std::use_facet<std::time_put<Char>>(loc);
   auto end = facet.put(os, os, Char(' '), &time, format, modifier);
   if (end.failed()) FMT_THROW(format_error("failed to format time"));
 }
@@ -432,6 +432,51 @@ auto write(OutputIt out, const std::tm& time, const std::locale& loc,
   return write_encoded_tm_str(out, string_view(buf.data(), buf.size()), loc);
 }
 
+template <typename Rep1, typename Rep2>
+struct is_same_arithmetic_type
+    : public std::integral_constant<bool,
+                                    (std::is_integral<Rep1>::value &&
+                                     std::is_integral<Rep2>::value) ||
+                                        (std::is_floating_point<Rep1>::value &&
+                                         std::is_floating_point<Rep2>::value)> {
+};
+
+template <
+    typename To, typename FromRep, typename FromPeriod,
+    FMT_ENABLE_IF(is_same_arithmetic_type<FromRep, typename To::rep>::value)>
+auto fmt_duration_cast(std::chrono::duration<FromRep, FromPeriod> from) -> To {
+#if FMT_SAFE_DURATION_CAST
+  // Throwing version of safe_duration_cast is only available for
+  // integer to integer or float to float casts.
+  int ec;
+  To to = safe_duration_cast::safe_duration_cast<To>(from, ec);
+  if (ec) FMT_THROW(format_error("cannot format duration"));
+  return to;
+#else
+  // Standard duration cast, may overflow.
+  return std::chrono::duration_cast<To>(from);
+#endif
+}
+
+template <
+    typename To, typename FromRep, typename FromPeriod,
+    FMT_ENABLE_IF(!is_same_arithmetic_type<FromRep, typename To::rep>::value)>
+auto fmt_duration_cast(std::chrono::duration<FromRep, FromPeriod> from) -> To {
+  // Mixed integer <-> float cast is not supported by safe_duration_cast.
+  return std::chrono::duration_cast<To>(from);
+}
+
+template <typename Duration>
+auto to_time_t(
+    std::chrono::time_point<std::chrono::system_clock, Duration> time_point)
+    -> std::time_t {
+  // Cannot use std::chrono::system_clock::to_time_t since this would first
+  // require a cast to std::chrono::system_clock::time_point, which could
+  // overflow.
+  return fmt_duration_cast<std::chrono::duration<std::time_t>>(
+             time_point.time_since_epoch())
+      .count();
+}
 }  // namespace detail
 
 FMT_BEGIN_EXPORT
@@ -441,29 +486,29 @@ FMT_BEGIN_EXPORT
   expressed in local time. Unlike ``std::localtime``, this function is
   thread-safe on most platforms.
  */
-inline std::tm localtime(std::time_t time) {
+inline auto localtime(std::time_t time) -> std::tm {
   struct dispatcher {
     std::time_t time_;
     std::tm tm_;
 
     dispatcher(std::time_t t) : time_(t) {}
 
-    bool run() {
+    auto run() -> bool {
       using namespace fmt::detail;
       return handle(localtime_r(&time_, &tm_));
     }
 
-    bool handle(std::tm* tm) { return tm != nullptr; }
+    auto handle(std::tm* tm) -> bool { return tm != nullptr; }
 
-    bool handle(detail::null<>) {
+    auto handle(detail::null<>) -> bool {
       using namespace fmt::detail;
       return fallback(localtime_s(&tm_, &time_));
     }
 
-    bool fallback(int res) { return res == 0; }
+    auto fallback(int res) -> bool { return res == 0; }
 
 #if !FMT_MSC_VERSION
-    bool fallback(detail::null<>) {
+    auto fallback(detail::null<>) -> bool {
       using namespace fmt::detail;
       std::tm* tm = std::localtime(&time_);
       if (tm) tm_ = *tm;
@@ -480,8 +525,8 @@ inline std::tm localtime(std::time_t time) {
 #if FMT_USE_LOCAL_TIME
 template <typename Duration>
 inline auto localtime(std::chrono::local_time<Duration> time) -> std::tm {
-  return localtime(std::chrono::system_clock::to_time_t(
-      std::chrono::current_zone()->to_sys(time)));
+  return localtime(
+      detail::to_time_t(std::chrono::current_zone()->to_sys(time)));
 }
 #endif
 
@@ -490,29 +535,29 @@ inline auto localtime(std::chrono::local_time<Duration> time) -> std::tm {
   expressed in Coordinated Universal Time (UTC). Unlike ``std::gmtime``, this
   function is thread-safe on most platforms.
  */
-inline std::tm gmtime(std::time_t time) {
+inline auto gmtime(std::time_t time) -> std::tm {
   struct dispatcher {
     std::time_t time_;
     std::tm tm_;
 
     dispatcher(std::time_t t) : time_(t) {}
 
-    bool run() {
+    auto run() -> bool {
       using namespace fmt::detail;
       return handle(gmtime_r(&time_, &tm_));
     }
 
-    bool handle(std::tm* tm) { return tm != nullptr; }
+    auto handle(std::tm* tm) -> bool { return tm != nullptr; }
 
-    bool handle(detail::null<>) {
+    auto handle(detail::null<>) -> bool {
       using namespace fmt::detail;
       return fallback(gmtime_s(&tm_, &time_));
     }
 
-    bool fallback(int res) { return res == 0; }
+    auto fallback(int res) -> bool { return res == 0; }
 
 #if !FMT_MSC_VERSION
-    bool fallback(detail::null<>) {
+    auto fallback(detail::null<>) -> bool {
       std::tm* tm = std::gmtime(&time_);
       if (tm) tm_ = *tm;
       return tm != nullptr;
@@ -525,9 +570,11 @@ inline std::tm gmtime(std::time_t time) {
   return gt.tm_;
 }
 
-inline std::tm gmtime(
-    std::chrono::time_point<std::chrono::system_clock> time_point) {
-  return gmtime(std::chrono::system_clock::to_time_t(time_point));
+template <typename Duration>
+inline auto gmtime(
+    std::chrono::time_point<std::chrono::system_clock, Duration> time_point)
+    -> std::tm {
+  return gmtime(detail::to_time_t(time_point));
 }
 
 namespace detail {
@@ -566,7 +613,8 @@ inline void write_digit2_separated(char* buf, unsigned a, unsigned b,
   }
 }
 
-template <typename Period> FMT_CONSTEXPR inline const char* get_units() {
+template <typename Period>
+FMT_CONSTEXPR inline auto get_units() -> const char* {
   if (std::is_same<Period, std::atto>::value) return "as";
   if (std::is_same<Period, std::femto>::value) return "fs";
   if (std::is_same<Period, std::pico>::value) return "ps";
@@ -584,8 +632,9 @@ template <typename Period> FMT_CONSTEXPR inline const char* get_units() {
   if (std::is_same<Period, std::tera>::value) return "Ts";
   if (std::is_same<Period, std::peta>::value) return "Ps";
   if (std::is_same<Period, std::exa>::value) return "Es";
-  if (std::is_same<Period, std::ratio<60>>::value) return "m";
+  if (std::is_same<Period, std::ratio<60>>::value) return "min";
   if (std::is_same<Period, std::ratio<3600>>::value) return "h";
+  if (std::is_same<Period, std::ratio<86400>>::value) return "d";
   return nullptr;
 }
 
@@ -621,9 +670,8 @@ auto write_padding(OutputIt out, pad_type pad) -> OutputIt {
 
 // Parses a put_time-like format string and invokes handler actions.
 template <typename Char, typename Handler>
-FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
-                                              const Char* end,
-                                              Handler&& handler) {
+FMT_CONSTEXPR auto parse_chrono_format(const Char* begin, const Char* end,
+                                       Handler&& handler) -> const Char* {
   if (begin == end || *begin == '}') return begin;
   if (*begin != '%') FMT_THROW(format_error("invalid format"));
   auto ptr = begin;
@@ -954,25 +1002,25 @@ struct tm_format_checker : null_chrono_spec_handler<tm_format_checker> {
   FMT_CONSTEXPR void on_tz_name() {}
 };
 
-inline const char* tm_wday_full_name(int wday) {
+inline auto tm_wday_full_name(int wday) -> const char* {
   static constexpr const char* full_name_list[] = {
       "Sunday",   "Monday", "Tuesday", "Wednesday",
       "Thursday", "Friday", "Saturday"};
   return wday >= 0 && wday <= 6 ? full_name_list[wday] : "?";
 }
-inline const char* tm_wday_short_name(int wday) {
+inline auto tm_wday_short_name(int wday) -> const char* {
   static constexpr const char* short_name_list[] = {"Sun", "Mon", "Tue", "Wed",
                                                     "Thu", "Fri", "Sat"};
   return wday >= 0 && wday <= 6 ? short_name_list[wday] : "???";
 }
 
-inline const char* tm_mon_full_name(int mon) {
+inline auto tm_mon_full_name(int mon) -> const char* {
   static constexpr const char* full_name_list[] = {
       "January", "February", "March",     "April",   "May",      "June",
       "July",    "August",   "September", "October", "November", "December"};
   return mon >= 0 && mon <= 11 ? full_name_list[mon] : "?";
 }
-inline const char* tm_mon_short_name(int mon) {
+inline auto tm_mon_short_name(int mon) -> const char* {
   static constexpr const char* short_name_list[] = {
       "Jan", "Feb", "Mar", "Apr", "May", "Jun",
       "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
@@ -1004,21 +1052,21 @@ inline void tzset_once() {
 
 // Converts value to Int and checks that it's in the range [0, upper).
 template <typename T, typename Int, FMT_ENABLE_IF(std::is_integral<T>::value)>
-inline Int to_nonnegative_int(T value, Int upper) {
-  FMT_ASSERT(std::is_unsigned<Int>::value ||
-                 (value >= 0 && to_unsigned(value) <= to_unsigned(upper)),
-             "invalid value");
-  (void)upper;
+inline auto to_nonnegative_int(T value, Int upper) -> Int {
+  if (!std::is_unsigned<Int>::value &&
+      (value < 0 || to_unsigned(value) > to_unsigned(upper))) {
+    FMT_THROW(fmt::format_error("chrono value is out of range"));
+  }
   return static_cast<Int>(value);
 }
 template <typename T, typename Int, FMT_ENABLE_IF(!std::is_integral<T>::value)>
-inline Int to_nonnegative_int(T value, Int upper) {
+inline auto to_nonnegative_int(T value, Int upper) -> Int {
   if (value < 0 || value > static_cast<T>(upper))
     FMT_THROW(format_error("invalid value"));
   return static_cast<Int>(value);
 }
 
-constexpr long long pow10(std::uint32_t n) {
+constexpr auto pow10(std::uint32_t n) -> long long {
   return n == 0 ? 1 : 10 * pow10(n - 1);
 }
 
@@ -1052,13 +1100,12 @@ void write_fractional_seconds(OutputIt& out, Duration d, int precision = -1) {
                                 std::chrono::seconds::rep>::type,
       std::ratio<1, detail::pow10(num_fractional_digits)>>;
 
-  const auto fractional =
-      d - std::chrono::duration_cast<std::chrono::seconds>(d);
+  const auto fractional = d - fmt_duration_cast<std::chrono::seconds>(d);
   const auto subseconds =
       std::chrono::treat_as_floating_point<
           typename subsecond_precision::rep>::value
           ? fractional.count()
-          : std::chrono::duration_cast<subsecond_precision>(fractional).count();
+          : fmt_duration_cast<subsecond_precision>(fractional).count();
   auto n = static_cast<uint32_or_64_or_128_t<long long>>(subseconds);
   const int num_digits = detail::count_digits(n);
 
@@ -1109,11 +1156,11 @@ void write_floating_seconds(memory_buffer& buf, Duration duration,
       num_fractional_digits = 6;
   }
 
-  format_to(std::back_inserter(buf), FMT_STRING("{:.{}f}"),
-            std::fmod(val * static_cast<rep>(Duration::period::num) /
-                          static_cast<rep>(Duration::period::den),
-                      static_cast<rep>(60)),
-            num_fractional_digits);
+  fmt::format_to(std::back_inserter(buf), FMT_STRING("{:.{}f}"),
+                 std::fmod(val * static_cast<rep>(Duration::period::num) /
+                               static_cast<rep>(Duration::period::den),
+                           static_cast<rep>(60)),
+                 num_fractional_digits);
 }
 
 template <typename OutputIt, typename Char,
@@ -1174,8 +1221,7 @@ class tm_writer {
     return static_cast<int>(l);
   }
 
-  // Algorithm:
-  // https://en.wikipedia.org/wiki/ISO_week_date#Calculating_the_week_number_from_a_month_and_day_of_the_month_or_ordinal_date
+  // Algorithm: https://en.wikipedia.org/wiki/ISO_week_date.
   auto iso_year_weeks(long long curr_year) const noexcept -> int {
     const auto prev_year = curr_year - 1;
     const auto curr_p =
@@ -1315,7 +1361,7 @@ class tm_writer {
         subsecs_(subsecs),
         tm_(tm) {}
 
-  OutputIt out() const { return out_; }
+  auto out() const -> OutputIt { return out_; }
 
   FMT_CONSTEXPR void on_text(const Char* begin, const Char* end) {
     out_ = copy_str<Char>(begin, end, out_);
@@ -1579,6 +1625,7 @@ struct chrono_format_checker : null_chrono_spec_handler<chrono_format_checker> {
 
   template <typename Char>
   FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
+  FMT_CONSTEXPR void on_day_of_year() {}
   FMT_CONSTEXPR void on_24_hour(numeric_system, pad_type) {}
   FMT_CONSTEXPR void on_12_hour(numeric_system, pad_type) {}
   FMT_CONSTEXPR void on_minute(numeric_system, pad_type) {}
@@ -1597,16 +1644,16 @@ struct chrono_format_checker : null_chrono_spec_handler<chrono_format_checker> {
 
 template <typename T,
           FMT_ENABLE_IF(std::is_integral<T>::value&& has_isfinite<T>::value)>
-inline bool isfinite(T) {
+inline auto isfinite(T) -> bool {
   return true;
 }
 
 template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
-inline T mod(T x, int y) {
+inline auto mod(T x, int y) -> T {
   return x % static_cast<T>(y);
 }
 template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
-inline T mod(T x, int y) {
+inline auto mod(T x, int y) -> T {
   return std::fmod(x, static_cast<T>(y));
 }
 
@@ -1621,49 +1668,38 @@ template <typename T> struct make_unsigned_or_unchanged<T, true> {
   using type = typename std::make_unsigned<T>::type;
 };
 
-#if FMT_SAFE_DURATION_CAST
-// throwing version of safe_duration_cast
-template <typename To, typename FromRep, typename FromPeriod>
-To fmt_safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from) {
-  int ec;
-  To to = safe_duration_cast::safe_duration_cast<To>(from, ec);
-  if (ec) FMT_THROW(format_error("cannot format duration"));
-  return to;
-}
-#endif
-
 template <typename Rep, typename Period,
           FMT_ENABLE_IF(std::is_integral<Rep>::value)>
-inline std::chrono::duration<Rep, std::milli> get_milliseconds(
-    std::chrono::duration<Rep, Period> d) {
+inline auto get_milliseconds(std::chrono::duration<Rep, Period> d)
+    -> std::chrono::duration<Rep, std::milli> {
   // this may overflow and/or the result may not fit in the
   // target type.
 #if FMT_SAFE_DURATION_CAST
   using CommonSecondsType =
       typename std::common_type<decltype(d), std::chrono::seconds>::type;
-  const auto d_as_common = fmt_safe_duration_cast<CommonSecondsType>(d);
+  const auto d_as_common = fmt_duration_cast<CommonSecondsType>(d);
   const auto d_as_whole_seconds =
-      fmt_safe_duration_cast<std::chrono::seconds>(d_as_common);
+      fmt_duration_cast<std::chrono::seconds>(d_as_common);
   // this conversion should be nonproblematic
   const auto diff = d_as_common - d_as_whole_seconds;
   const auto ms =
-      fmt_safe_duration_cast<std::chrono::duration<Rep, std::milli>>(diff);
+      fmt_duration_cast<std::chrono::duration<Rep, std::milli>>(diff);
   return ms;
 #else
-  auto s = std::chrono::duration_cast<std::chrono::seconds>(d);
-  return std::chrono::duration_cast<std::chrono::milliseconds>(d - s);
+  auto s = fmt_duration_cast<std::chrono::seconds>(d);
+  return fmt_duration_cast<std::chrono::milliseconds>(d - s);
 #endif
 }
 
 template <typename Char, typename Rep, typename OutputIt,
           FMT_ENABLE_IF(std::is_integral<Rep>::value)>
-OutputIt format_duration_value(OutputIt out, Rep val, int) {
+auto format_duration_value(OutputIt out, Rep val, int) -> OutputIt {
   return write<Char>(out, val);
 }
 
 template <typename Char, typename Rep, typename OutputIt,
           FMT_ENABLE_IF(std::is_floating_point<Rep>::value)>
-OutputIt format_duration_value(OutputIt out, Rep val, int precision) {
+auto format_duration_value(OutputIt out, Rep val, int precision) -> OutputIt {
   auto specs = format_specs<Char>();
   specs.precision = precision;
   specs.type = precision >= 0 ? presentation_type::fixed_lower
@@ -1672,12 +1708,12 @@ OutputIt format_duration_value(OutputIt out, Rep val, int precision) {
 }
 
 template <typename Char, typename OutputIt>
-OutputIt copy_unit(string_view unit, OutputIt out, Char) {
+auto copy_unit(string_view unit, OutputIt out, Char) -> OutputIt {
   return std::copy(unit.begin(), unit.end(), out);
 }
 
 template <typename OutputIt>
-OutputIt copy_unit(string_view unit, OutputIt out, wchar_t) {
+auto copy_unit(string_view unit, OutputIt out, wchar_t) -> OutputIt {
   // This works when wchar_t is UTF-32 because units only contain characters
   // that have the same representation in UTF-16 and UTF-32.
   utf8_to_utf16 u(unit);
@@ -1685,7 +1721,7 @@ OutputIt copy_unit(string_view unit, OutputIt out, wchar_t) {
 }
 
 template <typename Char, typename Period, typename OutputIt>
-OutputIt format_duration_unit(OutputIt out) {
+auto format_duration_unit(OutputIt out) -> OutputIt {
   if (const char* unit = get_units<Period>())
     return copy_unit(string_view(unit), out, Char());
   *out++ = '[';
@@ -1752,18 +1788,12 @@ struct chrono_formatter {
 
     // this may overflow and/or the result may not fit in the
     // target type.
-#if FMT_SAFE_DURATION_CAST
     // might need checked conversion (rep!=Rep)
-    auto tmpval = std::chrono::duration<rep, Period>(val);
-    s = fmt_safe_duration_cast<seconds>(tmpval);
-#else
-    s = std::chrono::duration_cast<seconds>(
-        std::chrono::duration<rep, Period>(val));
-#endif
+    s = fmt_duration_cast<seconds>(std::chrono::duration<rep, Period>(val));
   }
 
   // returns true if nan or inf, writes to out.
-  bool handle_nan_inf() {
+  auto handle_nan_inf() -> bool {
     if (isfinite(val)) {
       return false;
     }
@@ -1780,17 +1810,22 @@ struct chrono_formatter {
     return true;
   }
 
-  Rep hour() const { return static_cast<Rep>(mod((s.count() / 3600), 24)); }
+  auto days() const -> Rep { return static_cast<Rep>(s.count() / 86400); }
+  auto hour() const -> Rep {
+    return static_cast<Rep>(mod((s.count() / 3600), 24));
+  }
 
-  Rep hour12() const {
+  auto hour12() const -> Rep {
     Rep hour = static_cast<Rep>(mod((s.count() / 3600), 12));
     return hour <= 0 ? 12 : hour;
   }
 
-  Rep minute() const { return static_cast<Rep>(mod((s.count() / 60), 60)); }
-  Rep second() const { return static_cast<Rep>(mod(s.count(), 60)); }
+  auto minute() const -> Rep {
+    return static_cast<Rep>(mod((s.count() / 60), 60));
+  }
+  auto second() const -> Rep { return static_cast<Rep>(mod(s.count(), 60)); }
 
-  std::tm time() const {
+  auto time() const -> std::tm {
     auto time = std::tm();
     time.tm_hour = to_nonnegative_int(hour(), 24);
     time.tm_min = to_nonnegative_int(minute(), 60);
@@ -1858,10 +1893,14 @@ struct chrono_formatter {
   void on_dec0_week_of_year(numeric_system) {}
   void on_dec1_week_of_year(numeric_system) {}
   void on_iso_week_of_year(numeric_system) {}
-  void on_day_of_year() {}
   void on_day_of_month(numeric_system) {}
   void on_day_of_month_space(numeric_system) {}
 
+  void on_day_of_year() {
+    if (handle_nan_inf()) return;
+    write(days(), 0);
+  }
+
   void on_24_hour(numeric_system ns, pad_type pad) {
     if (handle_nan_inf()) return;
 
@@ -1968,7 +2007,7 @@ class weekday {
   weekday() = default;
   explicit constexpr weekday(unsigned wd) noexcept
       : value(static_cast<unsigned char>(wd != 7 ? wd : 0)) {}
-  constexpr unsigned c_encoding() const noexcept { return value; }
+  constexpr auto c_encoding() const noexcept -> unsigned { return value; }
 };
 
 class year_month_day {};
@@ -2083,25 +2122,22 @@ struct formatter<std::chrono::time_point<std::chrono::system_clock, Duration>,
             period::num != 1 || period::den != 1 ||
             std::is_floating_point<typename Duration::rep>::value)) {
       const auto epoch = val.time_since_epoch();
-      auto subsecs = std::chrono::duration_cast<Duration>(
-          epoch - std::chrono::duration_cast<std::chrono::seconds>(epoch));
+      auto subsecs = detail::fmt_duration_cast<Duration>(
+          epoch - detail::fmt_duration_cast<std::chrono::seconds>(epoch));
 
       if (subsecs.count() < 0) {
         auto second =
-            std::chrono::duration_cast<Duration>(std::chrono::seconds(1));
+            detail::fmt_duration_cast<Duration>(std::chrono::seconds(1));
         if (epoch.count() < ((Duration::min)() + second).count())
           FMT_THROW(format_error("duration is too small"));
         subsecs += second;
         val -= second;
       }
 
-      return formatter<std::tm, Char>::do_format(
-          gmtime(std::chrono::time_point_cast<std::chrono::seconds>(val)), ctx,
-          &subsecs);
+      return formatter<std::tm, Char>::do_format(gmtime(val), ctx, &subsecs);
     }
 
-    return formatter<std::tm, Char>::format(
-        gmtime(std::chrono::time_point_cast<std::chrono::seconds>(val)), ctx);
+    return formatter<std::tm, Char>::format(gmtime(val), ctx);
   }
 };
 
@@ -2120,17 +2156,13 @@ struct formatter<std::chrono::local_time<Duration>, Char>
     if (period::num != 1 || period::den != 1 ||
         std::is_floating_point<typename Duration::rep>::value) {
       const auto epoch = val.time_since_epoch();
-      const auto subsecs = std::chrono::duration_cast<Duration>(
-          epoch - std::chrono::duration_cast<std::chrono::seconds>(epoch));
+      const auto subsecs = detail::fmt_duration_cast<Duration>(
+          epoch - detail::fmt_duration_cast<std::chrono::seconds>(epoch));
 
-      return formatter<std::tm, Char>::do_format(
-          localtime(std::chrono::time_point_cast<std::chrono::seconds>(val)),
-          ctx, &subsecs);
+      return formatter<std::tm, Char>::do_format(localtime(val), ctx, &subsecs);
     }
 
-    return formatter<std::tm, Char>::format(
-        localtime(std::chrono::time_point_cast<std::chrono::seconds>(val)),
-        ctx);
+    return formatter<std::tm, Char>::format(localtime(val), ctx);
   }
 };
 #endif
diff --git a/src/fmt/color.h b/src/fmt/color.h
index 8697e1ca0b..464519e582 100644
--- a/src/fmt/color.h
+++ b/src/fmt/color.h
@@ -233,7 +233,7 @@ class text_style {
   FMT_CONSTEXPR text_style(emphasis em = emphasis()) noexcept
       : set_foreground_color(), set_background_color(), ems(em) {}
 
-  FMT_CONSTEXPR text_style& operator|=(const text_style& rhs) {
+  FMT_CONSTEXPR auto operator|=(const text_style& rhs) -> text_style& {
     if (!set_foreground_color) {
       set_foreground_color = rhs.set_foreground_color;
       foreground_color = rhs.foreground_color;
@@ -257,29 +257,29 @@ class text_style {
     return *this;
   }
 
-  friend FMT_CONSTEXPR text_style operator|(text_style lhs,
-                                            const text_style& rhs) {
+  friend FMT_CONSTEXPR auto operator|(text_style lhs, const text_style& rhs)
+      -> text_style {
     return lhs |= rhs;
   }
 
-  FMT_CONSTEXPR bool has_foreground() const noexcept {
+  FMT_CONSTEXPR auto has_foreground() const noexcept -> bool {
     return set_foreground_color;
   }
-  FMT_CONSTEXPR bool has_background() const noexcept {
+  FMT_CONSTEXPR auto has_background() const noexcept -> bool {
     return set_background_color;
   }
-  FMT_CONSTEXPR bool has_emphasis() const noexcept {
+  FMT_CONSTEXPR auto has_emphasis() const noexcept -> bool {
     return static_cast<uint8_t>(ems) != 0;
   }
-  FMT_CONSTEXPR detail::color_type get_foreground() const noexcept {
+  FMT_CONSTEXPR auto get_foreground() const noexcept -> detail::color_type {
     FMT_ASSERT(has_foreground(), "no foreground specified for this style");
     return foreground_color;
   }
-  FMT_CONSTEXPR detail::color_type get_background() const noexcept {
+  FMT_CONSTEXPR auto get_background() const noexcept -> detail::color_type {
     FMT_ASSERT(has_background(), "no background specified for this style");
     return background_color;
   }
-  FMT_CONSTEXPR emphasis get_emphasis() const noexcept {
+  FMT_CONSTEXPR auto get_emphasis() const noexcept -> emphasis {
     FMT_ASSERT(has_emphasis(), "no emphasis specified for this style");
     return ems;
   }
@@ -297,9 +297,11 @@ class text_style {
     }
   }
 
-  friend FMT_CONSTEXPR text_style fg(detail::color_type foreground) noexcept;
+  friend FMT_CONSTEXPR auto fg(detail::color_type foreground) noexcept
+      -> text_style;
 
-  friend FMT_CONSTEXPR text_style bg(detail::color_type background) noexcept;
+  friend FMT_CONSTEXPR auto bg(detail::color_type background) noexcept
+      -> text_style;
 
   detail::color_type foreground_color;
   detail::color_type background_color;
@@ -309,16 +311,19 @@ class text_style {
 };
 
 /** Creates a text style from the foreground (text) color. */
-FMT_CONSTEXPR inline text_style fg(detail::color_type foreground) noexcept {
+FMT_CONSTEXPR inline auto fg(detail::color_type foreground) noexcept
+    -> text_style {
   return text_style(true, foreground);
 }
 
 /** Creates a text style from the background color. */
-FMT_CONSTEXPR inline text_style bg(detail::color_type background) noexcept {
+FMT_CONSTEXPR inline auto bg(detail::color_type background) noexcept
+    -> text_style {
   return text_style(false, background);
 }
 
-FMT_CONSTEXPR inline text_style operator|(emphasis lhs, emphasis rhs) noexcept {
+FMT_CONSTEXPR inline auto operator|(emphasis lhs, emphasis rhs) noexcept
+    -> text_style {
   return text_style(lhs) | rhs;
 }
 
@@ -384,8 +389,8 @@ template <typename Char> struct ansi_color_escape {
   }
   FMT_CONSTEXPR operator const Char*() const noexcept { return buffer; }
 
-  FMT_CONSTEXPR const Char* begin() const noexcept { return buffer; }
-  FMT_CONSTEXPR_CHAR_TRAITS const Char* end() const noexcept {
+  FMT_CONSTEXPR auto begin() const noexcept -> const Char* { return buffer; }
+  FMT_CONSTEXPR20 auto end() const noexcept -> const Char* {
     return buffer + std::char_traits<Char>::length(buffer);
   }
 
@@ -400,25 +405,27 @@ template <typename Char> struct ansi_color_escape {
     out[2] = static_cast<Char>('0' + c % 10);
     out[3] = static_cast<Char>(delimiter);
   }
-  static FMT_CONSTEXPR bool has_emphasis(emphasis em, emphasis mask) noexcept {
+  static FMT_CONSTEXPR auto has_emphasis(emphasis em, emphasis mask) noexcept
+      -> bool {
     return static_cast<uint8_t>(em) & static_cast<uint8_t>(mask);
   }
 };
 
 template <typename Char>
-FMT_CONSTEXPR ansi_color_escape<Char> make_foreground_color(
-    detail::color_type foreground) noexcept {
+FMT_CONSTEXPR auto make_foreground_color(detail::color_type foreground) noexcept
+    -> ansi_color_escape<Char> {
   return ansi_color_escape<Char>(foreground, "\x1b[38;2;");
 }
 
 template <typename Char>
-FMT_CONSTEXPR ansi_color_escape<Char> make_background_color(
-    detail::color_type background) noexcept {
+FMT_CONSTEXPR auto make_background_color(detail::color_type background) noexcept
+    -> ansi_color_escape<Char> {
   return ansi_color_escape<Char>(background, "\x1b[48;2;");
 }
 
 template <typename Char>
-FMT_CONSTEXPR ansi_color_escape<Char> make_emphasis(emphasis em) noexcept {
+FMT_CONSTEXPR auto make_emphasis(emphasis em) noexcept
+    -> ansi_color_escape<Char> {
   return ansi_color_escape<Char>(em);
 }
 
@@ -427,9 +434,10 @@ template <typename Char> inline void reset_color(buffer<Char>& buffer) {
   buffer.append(reset_color.begin(), reset_color.end());
 }
 
-template <typename T> struct styled_arg {
+template <typename T> struct styled_arg : detail::view {
   const T& value;
   text_style style;
+  styled_arg(const T& v, text_style s) : value(v), style(s) {}
 };
 
 template <typename Char>
@@ -510,9 +518,10 @@ void print(const text_style& ts, const S& format_str, const Args&... args) {
 }
 
 template <typename S, typename Char = char_t<S>>
-inline std::basic_string<Char> vformat(
+inline auto vformat(
     const text_style& ts, const S& format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+    basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> std::basic_string<Char> {
   basic_memory_buffer<Char> buf;
   detail::vformat_to(buf, ts, detail::to_string_view(format_str), args);
   return fmt::to_string(buf);
@@ -531,8 +540,8 @@ inline std::basic_string<Char> vformat(
   \endrst
 */
 template <typename S, typename... Args, typename Char = char_t<S>>
-inline std::basic_string<Char> format(const text_style& ts, const S& format_str,
-                                      const Args&... args) {
+inline auto format(const text_style& ts, const S& format_str,
+                   const Args&... args) -> std::basic_string<Char> {
   return fmt::vformat(ts, detail::to_string_view(format_str),
                       fmt::make_format_args<buffer_context<Char>>(args...));
 }
@@ -542,9 +551,10 @@ inline std::basic_string<Char> format(const text_style& ts, const S& format_str,
  */
 template <typename OutputIt, typename Char,
           FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
-OutputIt vformat_to(
-    OutputIt out, const text_style& ts, basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+auto vformat_to(OutputIt out, const text_style& ts,
+                basic_string_view<Char> format_str,
+                basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> OutputIt {
   auto&& buf = detail::get_buffer<Char>(out);
   detail::vformat_to(buf, ts, format_str, args);
   return detail::get_iterator(buf, out);
@@ -562,9 +572,10 @@ OutputIt vformat_to(
                    fmt::emphasis::bold | fg(fmt::color::red), "{}", 42);
   \endrst
 */
-template <typename OutputIt, typename S, typename... Args,
-          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value&&
-              detail::is_string<S>::value>
+template <
+    typename OutputIt, typename S, typename... Args,
+    bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value &&
+                  detail::is_string<S>::value>
 inline auto format_to(OutputIt out, const text_style& ts, const S& format_str,
                       Args&&... args) ->
     typename std::enable_if<enable, OutputIt>::type {
diff --git a/src/fmt/compile.h b/src/fmt/compile.h
index af76507f07..71fa69c67e 100644
--- a/src/fmt/compile.h
+++ b/src/fmt/compile.h
@@ -14,8 +14,8 @@ FMT_BEGIN_NAMESPACE
 namespace detail {
 
 template <typename Char, typename InputIt>
-FMT_CONSTEXPR inline counting_iterator copy_str(InputIt begin, InputIt end,
-                                                counting_iterator it) {
+FMT_CONSTEXPR inline auto copy_str(InputIt begin, InputIt end,
+                                   counting_iterator it) -> counting_iterator {
   return it + (end - begin);
 }
 
@@ -57,7 +57,7 @@ struct udl_compiled_string : compiled_string {
 #endif
 
 template <typename T, typename... Tail>
-const T& first(const T& value, const Tail&...) {
+auto first(const T& value, const Tail&...) -> const T& {
   return value;
 }
 
@@ -489,18 +489,19 @@ FMT_CONSTEXPR OutputIt format_to(OutputIt out, const S&, Args&&... args) {
 
 template <typename OutputIt, typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n,
-                                         const S& format_str, Args&&... args) {
+auto format_to_n(OutputIt out, size_t n, const S& format_str, Args&&... args)
+    -> format_to_n_result<OutputIt> {
   using traits = detail::fixed_buffer_traits;
   auto buf = detail::iterator_buffer<OutputIt, char, traits>(out, n);
-  format_to(std::back_inserter(buf), format_str, std::forward<Args>(args)...);
+  fmt::format_to(std::back_inserter(buf), format_str,
+                 std::forward<Args>(args)...);
   return {buf.out(), buf.count()};
 }
 
 template <typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-FMT_CONSTEXPR20 size_t formatted_size(const S& format_str,
-                                      const Args&... args) {
+FMT_CONSTEXPR20 auto formatted_size(const S& format_str, const Args&... args)
+    -> size_t {
   return fmt::format_to(detail::counting_iterator(), format_str, args...)
       .count();
 }
diff --git a/src/fmt/core.h b/src/fmt/core.h
index 9f7de781bb..6a53b8c52c 100644
--- a/src/fmt/core.h
+++ b/src/fmt/core.h
@@ -8,17 +8,15 @@
 #ifndef FMT_CORE_H_
 #define FMT_CORE_H_
 
-#include <cstddef>  // std::byte
-#include <cstdio>   // std::FILE
-#include <cstring>  // std::strlen
-#include <iterator>
-#include <limits>
-#include <memory>  // std::addressof
-#include <string>
-#include <type_traits>
+#include <cstddef>      // std::byte
+#include <cstdio>       // std::FILE
+#include <cstring>      // std::strlen
+#include <limits.h>     // CHAR_BIT
+#include <string>       // std::string
+#include <type_traits>  // std::enable_if
 
 // The fmt library version in the form major * 10000 + minor * 100 + patch.
-#define FMT_VERSION 100100
+#define FMT_VERSION 100200
 
 #if defined(__clang__) && !defined(__ibmxl__)
 #  define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
@@ -58,6 +56,12 @@
 #  define FMT_MSC_WARNING(...)
 #endif
 
+#ifdef _GLIBCXX_RELEASE
+#  define FMT_GLIBCXX_RELEASE _GLIBCXX_RELEASE
+#else
+#  define FMT_GLIBCXX_RELEASE 0
+#endif
+
 #ifdef _MSVC_LANG
 #  define FMT_CPLUSPLUS _MSVC_LANG
 #else
@@ -88,6 +92,20 @@
 #define FMT_HAS_CPP17_ATTRIBUTE(attribute) \
   (FMT_CPLUSPLUS >= 201703L && FMT_HAS_CPP_ATTRIBUTE(attribute))
 
+#ifndef FMT_DEPRECATED
+#  if FMT_HAS_CPP14_ATTRIBUTE(deprecated) || FMT_MSC_VERSION >= 1900
+#    define FMT_DEPRECATED [[deprecated]]
+#  else
+#    if (defined(__GNUC__) && !defined(__LCC__)) || defined(__clang__)
+#      define FMT_DEPRECATED __attribute__((deprecated))
+#    elif FMT_MSC_VERSION
+#      define FMT_DEPRECATED __declspec(deprecated)
+#    else
+#      define FMT_DEPRECATED /* deprecated */
+#    endif
+#  endif
+#endif
+
 // Check if relaxed C++14 constexpr is supported.
 // GCC doesn't allow throw in constexpr until version 6 (bug 67371).
 #ifndef FMT_USE_CONSTEXPR
@@ -105,30 +123,17 @@
 #  define FMT_CONSTEXPR
 #endif
 
-#if ((FMT_CPLUSPLUS >= 202002L) &&                            \
-     (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE > 9)) || \
-    (FMT_CPLUSPLUS >= 201709L && FMT_GCC_VERSION >= 1002)
+#if (FMT_CPLUSPLUS >= 202002L ||                                \
+     (FMT_CPLUSPLUS >= 201709L && FMT_GCC_VERSION >= 1002)) &&  \
+    ((!FMT_GLIBCXX_RELEASE || FMT_GLIBCXX_RELEASE >= 10) &&     \
+     (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION >= 10000) && \
+     (!FMT_MSC_VERSION || FMT_MSC_VERSION >= 1928)) &&          \
+    defined(__cpp_lib_is_constant_evaluated)
 #  define FMT_CONSTEXPR20 constexpr
 #else
 #  define FMT_CONSTEXPR20
 #endif
 
-// Check if constexpr std::char_traits<>::{compare,length} are supported.
-#if defined(__GLIBCXX__)
-#  if FMT_CPLUSPLUS >= 201703L && defined(_GLIBCXX_RELEASE) && \
-      _GLIBCXX_RELEASE >= 7  // GCC 7+ libstdc++ has _GLIBCXX_RELEASE.
-#    define FMT_CONSTEXPR_CHAR_TRAITS constexpr
-#  endif
-#elif defined(_LIBCPP_VERSION) && FMT_CPLUSPLUS >= 201703L && \
-    _LIBCPP_VERSION >= 4000
-#  define FMT_CONSTEXPR_CHAR_TRAITS constexpr
-#elif FMT_MSC_VERSION >= 1914 && FMT_CPLUSPLUS >= 201703L
-#  define FMT_CONSTEXPR_CHAR_TRAITS constexpr
-#endif
-#ifndef FMT_CONSTEXPR_CHAR_TRAITS
-#  define FMT_CONSTEXPR_CHAR_TRAITS
-#endif
-
 // Check if exceptions are disabled.
 #ifndef FMT_EXCEPTIONS
 #  if (defined(__GNUC__) && !defined(__EXCEPTIONS)) || \
@@ -191,33 +196,25 @@
 #  define FMT_END_EXPORT
 #endif
 
+#if FMT_GCC_VERSION || FMT_CLANG_VERSION
+#  define FMT_VISIBILITY(value) __attribute__((visibility(value)))
+#else
+#  define FMT_VISIBILITY(value)
+#endif
+
 #if !defined(FMT_HEADER_ONLY) && defined(_WIN32)
-#  ifdef FMT_LIB_EXPORT
+#  if defined(FMT_LIB_EXPORT)
 #    define FMT_API __declspec(dllexport)
 #  elif defined(FMT_SHARED)
 #    define FMT_API __declspec(dllimport)
 #  endif
-#else
-#  if defined(FMT_LIB_EXPORT) || defined(FMT_SHARED)
-#    if defined(__GNUC__) || defined(__clang__)
-#      define FMT_API __attribute__((visibility("default")))
-#    endif
-#  endif
+#elif defined(FMT_LIB_EXPORT) || defined(FMT_SHARED)
+#  define FMT_API FMT_VISIBILITY("default")
 #endif
 #ifndef FMT_API
 #  define FMT_API
 #endif
 
-// libc++ supports string_view in pre-c++17.
-#if FMT_HAS_INCLUDE(<string_view>) && \
-    (FMT_CPLUSPLUS >= 201703L || defined(_LIBCPP_VERSION))
-#  include <string_view>
-#  define FMT_USE_STRING_VIEW
-#elif FMT_HAS_INCLUDE("experimental/string_view") && FMT_CPLUSPLUS >= 201402L
-#  include <experimental/string_view>
-#  define FMT_USE_EXPERIMENTAL_STRING_VIEW
-#endif
-
 #ifndef FMT_UNICODE
 #  define FMT_UNICODE !FMT_MSC_VERSION
 #endif
@@ -228,8 +225,9 @@
         __apple_build_version__ >= 14000029L) &&                 \
        FMT_CPLUSPLUS >= 202002L) ||                              \
       (defined(__cpp_consteval) &&                               \
-       (!FMT_MSC_VERSION || _MSC_FULL_VER >= 193030704))
-// consteval is broken in MSVC before VS2022 and Apple clang before 14.
+       (!FMT_MSC_VERSION || FMT_MSC_VERSION >= 1929))
+// consteval is broken in MSVC before VS2019 version 16.10 and Apple clang
+// before 14.
 #    define FMT_CONSTEVAL consteval
 #    define FMT_HAS_CONSTEVAL
 #  else
@@ -248,6 +246,15 @@
 #  endif
 #endif
 
+// GCC < 5 requires this-> in decltype.
+#ifndef FMT_DECLTYPE_THIS
+#  if FMT_GCC_VERSION && FMT_GCC_VERSION < 500
+#    define FMT_DECLTYPE_THIS this->
+#  else
+#    define FMT_DECLTYPE_THIS
+#  endif
+#endif
+
 // Enable minimal optimizations for more compact code in debug mode.
 FMT_GCC_PRAGMA("GCC push_options")
 #if !defined(__OPTIMIZE__) && !defined(__NVCOMPILER) && !defined(__LCC__) && \
@@ -269,20 +276,57 @@ template <typename T>
 using remove_const_t = typename std::remove_const<T>::type;
 template <typename T>
 using remove_cvref_t = typename std::remove_cv<remove_reference_t<T>>::type;
-template <typename T> struct type_identity { using type = T; };
+template <typename T> struct type_identity {
+  using type = T;
+};
 template <typename T> using type_identity_t = typename type_identity<T>::type;
 template <typename T>
 using underlying_t = typename std::underlying_type<T>::type;
 
-// Checks whether T is a container with contiguous storage.
-template <typename T> struct is_contiguous : std::false_type {};
-template <typename Char>
-struct is_contiguous<std::basic_string<Char>> : std::true_type {};
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500
+// A workaround for gcc 4.8 to make void_t work in a SFINAE context.
+template <typename...> struct void_t_impl {
+  using type = void;
+};
+template <typename... T> using void_t = typename void_t_impl<T...>::type;
+#else
+template <typename...> using void_t = void;
+#endif
 
 struct monostate {
   constexpr monostate() {}
 };
 
+// An implementation of back_insert_iterator to avoid dependency on <iterator>.
+template <typename Container> class back_insert_iterator {
+ private:
+  Container* container_;
+
+  friend auto get_container(back_insert_iterator it) -> Container& {
+    return *it.container_;
+  }
+
+ public:
+  using difference_type = ptrdiff_t;
+  FMT_UNCHECKED_ITERATOR(back_insert_iterator);
+
+  explicit back_insert_iterator(Container& c) : container_(&c) {}
+
+  auto operator=(const typename Container::value_type& value)
+      -> back_insert_iterator& {
+    container_->push_back(value);
+    return *this;
+  }
+  auto operator*() -> back_insert_iterator& { return *this; }
+  auto operator++() -> back_insert_iterator& { return *this; }
+  auto operator++(int) -> back_insert_iterator { return *this; }
+};
+
+template <typename Container>
+auto back_inserter(Container& c) -> back_insert_iterator<Container> {
+  return {c};
+}
+
 // An enable_if helper to be used in template parameters which results in much
 // shorter symbols: https://godbolt.org/z/sWw4vP. Extra parentheses are needed
 // to workaround a bug in MSVC 2019 (see #1140 and #1186).
@@ -310,10 +354,9 @@ template <typename... T> FMT_CONSTEXPR void ignore_unused(const T&...) {}
 constexpr FMT_INLINE auto is_constant_evaluated(
     bool default_value = false) noexcept -> bool {
 // Workaround for incompatibility between libstdc++ consteval-based
-// std::is_constant_evaluated() implementation and clang-14.
-// https://github.com/fmtlib/fmt/issues/3247
-#if FMT_CPLUSPLUS >= 202002L && defined(_GLIBCXX_RELEASE) && \
-    _GLIBCXX_RELEASE >= 12 &&                                \
+// std::is_constant_evaluated() implementation and clang-14:
+// https://github.com/fmtlib/fmt/issues/3247.
+#if FMT_CPLUSPLUS >= 202002L && FMT_GLIBCXX_RELEASE >= 12 && \
     (FMT_CLANG_VERSION >= 1400 && FMT_CLANG_VERSION < 1500)
   ignore_unused(default_value);
   return __builtin_is_constant_evaluated();
@@ -346,15 +389,6 @@ FMT_NORETURN FMT_API void assert_fail(const char* file, int line,
 #  endif
 #endif
 
-#if defined(FMT_USE_STRING_VIEW)
-template <typename Char> using std_string_view = std::basic_string_view<Char>;
-#elif defined(FMT_USE_EXPERIMENTAL_STRING_VIEW)
-template <typename Char>
-using std_string_view = std::experimental::basic_string_view<Char>;
-#else
-template <typename T> struct std_string_view {};
-#endif
-
 #ifdef FMT_USE_INT128
 // Do nothing.
 #elif defined(__SIZEOF_INT128__) && !defined(__NVCC__) && \
@@ -386,6 +420,15 @@ FMT_CONSTEXPR auto to_unsigned(Int value) ->
   return static_cast<typename std::make_unsigned<Int>::type>(value);
 }
 
+template <typename T, typename Enable = void>
+struct is_string_like : std::false_type {};
+
+// A heuristic to detect std::string and std::string_view.
+template <typename T>
+struct is_string_like<T, void_t<decltype(std::declval<T>().find_first_of(
+                             typename T::value_type(), 0))>> : std::true_type {
+};
+
 FMT_CONSTEXPR inline auto is_utf8() -> bool {
   FMT_MSC_WARNING(suppress : 4566) constexpr unsigned char section[] = "\u00A7";
 
@@ -394,8 +437,33 @@ FMT_CONSTEXPR inline auto is_utf8() -> bool {
   return FMT_UNICODE || (sizeof(section) == 3 && uchar(section[0]) == 0xC2 &&
                          uchar(section[1]) == 0xA7);
 }
+
+template <typename Char> FMT_CONSTEXPR auto length(const Char* s) -> size_t {
+  size_t len = 0;
+  while (*s++) ++len;
+  return len;
+}
+
+template <typename Char>
+FMT_CONSTEXPR auto compare(const Char* s1, const Char* s2, std::size_t n)
+    -> int {
+  for (; n != 0; ++s1, ++s2, --n) {
+    if (*s1 < *s2) return -1;
+    if (*s1 > *s2) return 1;
+  }
+  return 0;
+}
 }  // namespace detail
 
+template <typename Char>
+using basic_string =
+    std::basic_string<Char, std::char_traits<Char>, std::allocator<Char>>;
+
+// Checks whether T is a container with contiguous storage.
+template <typename T> struct is_contiguous : std::false_type {};
+template <typename Char>
+struct is_contiguous<basic_string<Char>> : std::true_type {};
+
 /**
   An implementation of ``std::basic_string_view`` for pre-C++17. It provides a
   subset of the API. ``fmt::basic_string_view`` is used for format strings even
@@ -420,29 +488,25 @@ template <typename Char> class basic_string_view {
       : data_(s), size_(count) {}
 
   /**
-    \rst
-    Constructs a string reference object from a C string computing
-    the size with ``std::char_traits<Char>::length``.
-    \endrst
+    Constructs a string reference object from a C string.
    */
-  FMT_CONSTEXPR_CHAR_TRAITS
+  FMT_CONSTEXPR20
   FMT_INLINE
   basic_string_view(const Char* s)
       : data_(s),
         size_(detail::const_check(std::is_same<Char, char>::value &&
-                                  !detail::is_constant_evaluated(true))
+                                  !detail::is_constant_evaluated(false))
                   ? std::strlen(reinterpret_cast<const char*>(s))
-                  : std::char_traits<Char>::length(s)) {}
+                  : detail::length(s)) {}
 
-  /** Constructs a string reference from a ``std::basic_string`` object. */
-  template <typename Traits, typename Alloc>
-  FMT_CONSTEXPR basic_string_view(
-      const std::basic_string<Char, Traits, Alloc>& s) noexcept
-      : data_(s.data()), size_(s.size()) {}
-
-  template <typename S, FMT_ENABLE_IF(std::is_same<
-                                      S, detail::std_string_view<Char>>::value)>
-  FMT_CONSTEXPR basic_string_view(S s) noexcept
+  /**
+    Constructs a string reference from a ``std::basic_string`` or a
+    ``std::basic_string_view`` object.
+  */
+  template <typename S,
+            FMT_ENABLE_IF(detail::is_string_like<S>::value&& std::is_same<
+                          typename S::value_type, Char>::value)>
+  FMT_CONSTEXPR basic_string_view(const S& s) noexcept
       : data_(s.data()), size_(s.size()) {}
 
   /** Returns a pointer to the string data. */
@@ -463,30 +527,28 @@ template <typename Char> class basic_string_view {
     size_ -= n;
   }
 
-  FMT_CONSTEXPR_CHAR_TRAITS bool starts_with(
-      basic_string_view<Char> sv) const noexcept {
-    return size_ >= sv.size_ &&
-           std::char_traits<Char>::compare(data_, sv.data_, sv.size_) == 0;
+  FMT_CONSTEXPR auto starts_with(basic_string_view<Char> sv) const noexcept
+      -> bool {
+    return size_ >= sv.size_ && detail::compare(data_, sv.data_, sv.size_) == 0;
   }
-  FMT_CONSTEXPR_CHAR_TRAITS bool starts_with(Char c) const noexcept {
-    return size_ >= 1 && std::char_traits<Char>::eq(*data_, c);
+  FMT_CONSTEXPR auto starts_with(Char c) const noexcept -> bool {
+    return size_ >= 1 && *data_ == c;
   }
-  FMT_CONSTEXPR_CHAR_TRAITS bool starts_with(const Char* s) const {
+  FMT_CONSTEXPR auto starts_with(const Char* s) const -> bool {
     return starts_with(basic_string_view<Char>(s));
   }
 
   // Lexicographically compare this string reference to other.
-  FMT_CONSTEXPR_CHAR_TRAITS auto compare(basic_string_view other) const -> int {
+  FMT_CONSTEXPR auto compare(basic_string_view other) const -> int {
     size_t str_size = size_ < other.size_ ? size_ : other.size_;
-    int result = std::char_traits<Char>::compare(data_, other.data_, str_size);
+    int result = detail::compare(data_, other.data_, str_size);
     if (result == 0)
       result = size_ == other.size_ ? 0 : (size_ < other.size_ ? -1 : 1);
     return result;
   }
 
-  FMT_CONSTEXPR_CHAR_TRAITS friend auto operator==(basic_string_view lhs,
-                                                   basic_string_view rhs)
-      -> bool {
+  FMT_CONSTEXPR friend auto operator==(basic_string_view lhs,
+                                       basic_string_view rhs) -> bool {
     return lhs.compare(rhs) == 0;
   }
   friend auto operator!=(basic_string_view lhs, basic_string_view rhs) -> bool {
@@ -526,21 +588,16 @@ template <typename Char, FMT_ENABLE_IF(is_char<Char>::value)>
 FMT_INLINE auto to_string_view(const Char* s) -> basic_string_view<Char> {
   return s;
 }
-template <typename Char, typename Traits, typename Alloc>
-inline auto to_string_view(const std::basic_string<Char, Traits, Alloc>& s)
-    -> basic_string_view<Char> {
-  return s;
+template <typename S, FMT_ENABLE_IF(is_string_like<S>::value)>
+inline auto to_string_view(const S& s)
+    -> basic_string_view<typename S::value_type> {
+  return s;  // std::basic_string[_view]
 }
 template <typename Char>
 constexpr auto to_string_view(basic_string_view<Char> s)
     -> basic_string_view<Char> {
   return s;
 }
-template <typename Char,
-          FMT_ENABLE_IF(!std::is_empty<std_string_view<Char>>::value)>
-inline auto to_string_view(std_string_view<Char> s) -> basic_string_view<Char> {
-  return s;
-}
 template <typename S, FMT_ENABLE_IF(is_compile_string<S>::value)>
 constexpr auto to_string_view(const S& s)
     -> basic_string_view<typename S::char_type> {
@@ -609,10 +666,10 @@ FMT_TYPE_CONSTANT(const Char*, cstring_type);
 FMT_TYPE_CONSTANT(basic_string_view<Char>, string_type);
 FMT_TYPE_CONSTANT(const void*, pointer_type);
 
-constexpr bool is_integral_type(type t) {
+constexpr auto is_integral_type(type t) -> bool {
   return t > type::none_type && t <= type::last_integer_type;
 }
-constexpr bool is_arithmetic_type(type t) {
+constexpr auto is_arithmetic_type(type t) -> bool {
   return t > type::none_type && t <= type::last_numeric_type;
 }
 
@@ -635,21 +692,10 @@ enum {
   cstring_set = set(type::cstring_type),
   pointer_set = set(type::pointer_type)
 };
-
-FMT_NORETURN FMT_API void throw_format_error(const char* message);
-
-struct error_handler {
-  constexpr error_handler() = default;
-
-  // This function is intentionally not constexpr to give a compile-time error.
-  FMT_NORETURN void on_error(const char* message) {
-    throw_format_error(message);
-  }
-};
 }  // namespace detail
 
 /** Throws ``format_error`` with a given message. */
-using detail::throw_format_error;
+FMT_NORETURN FMT_API void throw_format_error(const char* message);
 
 /** String's character type. */
 template <typename S> using char_t = typename detail::char_t_impl<S>::type;
@@ -701,7 +747,7 @@ template <typename Char> class basic_format_parse_context {
    */
   FMT_CONSTEXPR auto next_arg_id() -> int {
     if (next_arg_id_ < 0) {
-      detail::throw_format_error(
+      throw_format_error(
           "cannot switch from manual to automatic argument indexing");
       return 0;
     }
@@ -716,7 +762,7 @@ template <typename Char> class basic_format_parse_context {
    */
   FMT_CONSTEXPR void check_arg_id(int id) {
     if (next_arg_id_ > 0) {
-      detail::throw_format_error(
+      throw_format_error(
           "cannot switch from automatic to manual argument indexing");
       return;
     }
@@ -769,35 +815,6 @@ class compile_parse_context : public basic_format_parse_context<Char> {
   }
 };
 
-// Extracts a reference to the container from back_insert_iterator.
-template <typename Container>
-inline auto get_container(std::back_insert_iterator<Container> it)
-    -> Container& {
-  using base = std::back_insert_iterator<Container>;
-  struct accessor : base {
-    accessor(base b) : base(b) {}
-    using base::container;
-  };
-  return *accessor(it).container;
-}
-
-template <typename Char, typename InputIt, typename OutputIt>
-FMT_CONSTEXPR auto copy_str(InputIt begin, InputIt end, OutputIt out)
-    -> OutputIt {
-  while (begin != end) *out++ = static_cast<Char>(*begin++);
-  return out;
-}
-
-template <typename Char, typename T, typename U,
-          FMT_ENABLE_IF(
-              std::is_same<remove_const_t<T>, U>::value&& is_char<U>::value)>
-FMT_CONSTEXPR auto copy_str(T* begin, T* end, U* out) -> U* {
-  if (is_constant_evaluated()) return copy_str<Char, T*, U*>(begin, end, out);
-  auto size = to_unsigned(end - begin);
-  if (size > 0) memcpy(out, begin, size * sizeof(U));
-  return out + size;
-}
-
 /**
   \rst
   A contiguous memory buffer with an optional growing ability. It is an internal
@@ -810,13 +827,18 @@ template <typename T> class buffer {
   size_t size_;
   size_t capacity_;
 
+  using grow_fun = void (*)(buffer& buf, size_t capacity);
+  grow_fun grow_;
+
  protected:
   // Don't initialize ptr_ since it is not accessed to save a few cycles.
   FMT_MSC_WARNING(suppress : 26495)
-  buffer(size_t sz) noexcept : size_(sz), capacity_(sz) {}
+  FMT_CONSTEXPR buffer(grow_fun grow, size_t sz) noexcept
+      : size_(sz), capacity_(sz), grow_(grow) {}
 
-  FMT_CONSTEXPR20 buffer(T* p = nullptr, size_t sz = 0, size_t cap = 0) noexcept
-      : ptr_(p), size_(sz), capacity_(cap) {}
+  FMT_CONSTEXPR20 buffer(grow_fun grow, T* p = nullptr, size_t sz = 0,
+                         size_t cap = 0) noexcept
+      : ptr_(p), size_(sz), capacity_(cap), grow_(grow) {}
 
   FMT_CONSTEXPR20 ~buffer() = default;
   buffer(buffer&&) = default;
@@ -827,9 +849,6 @@ template <typename T> class buffer {
     capacity_ = buf_capacity;
   }
 
-  /** Increases the buffer capacity to hold at least *capacity* elements. */
-  virtual FMT_CONSTEXPR20 void grow(size_t capacity) = 0;
-
  public:
   using value_type = T;
   using const_reference = const T&;
@@ -868,7 +887,7 @@ template <typename T> class buffer {
   // for at least one additional element either by increasing the capacity or by
   // flushing the buffer if it is full.
   FMT_CONSTEXPR20 void try_reserve(size_t new_capacity) {
-    if (new_capacity > capacity_) grow(new_capacity);
+    if (new_capacity > capacity_) grow_(*this, new_capacity);
   }
 
   FMT_CONSTEXPR20 void push_back(const T& value) {
@@ -917,22 +936,25 @@ class iterator_buffer final : public Traits, public buffer<T> {
   enum { buffer_size = 256 };
   T data_[buffer_size];
 
- protected:
-  FMT_CONSTEXPR20 void grow(size_t) override {
-    if (this->size() == buffer_size) flush();
+  static FMT_CONSTEXPR20 void grow(buffer<T>& buf, size_t) {
+    if (buf.size() == buffer_size) static_cast<iterator_buffer&>(buf).flush();
   }
 
   void flush() {
     auto size = this->size();
     this->clear();
-    out_ = copy_str<T>(data_, data_ + this->limit(size), out_);
+    const T* begin = data_;
+    const T* end = begin + this->limit(size);
+    while (begin != end) *out_++ = *begin++;
   }
 
  public:
   explicit iterator_buffer(OutputIt out, size_t n = buffer_size)
-      : Traits(n), buffer<T>(data_, 0, buffer_size), out_(out) {}
+      : Traits(n), buffer<T>(grow, data_, 0, buffer_size), out_(out) {}
   iterator_buffer(iterator_buffer&& other)
-      : Traits(other), buffer<T>(data_, 0, buffer_size), out_(other.out_) {}
+      : Traits(other),
+        buffer<T>(grow, data_, 0, buffer_size),
+        out_(other.out_) {}
   ~iterator_buffer() { flush(); }
 
   auto out() -> OutputIt {
@@ -951,9 +973,9 @@ class iterator_buffer<T*, T, fixed_buffer_traits> final
   enum { buffer_size = 256 };
   T data_[buffer_size];
 
- protected:
-  FMT_CONSTEXPR20 void grow(size_t) override {
-    if (this->size() == this->capacity()) flush();
+  static FMT_CONSTEXPR20 void grow(buffer<T>& buf, size_t) {
+    if (buf.size() == buf.capacity())
+      static_cast<iterator_buffer&>(buf).flush();
   }
 
   void flush() {
@@ -967,10 +989,10 @@ class iterator_buffer<T*, T, fixed_buffer_traits> final
 
  public:
   explicit iterator_buffer(T* out, size_t n = buffer_size)
-      : fixed_buffer_traits(n), buffer<T>(out, 0, n), out_(out) {}
+      : fixed_buffer_traits(n), buffer<T>(grow, out, 0, n), out_(out) {}
   iterator_buffer(iterator_buffer&& other)
       : fixed_buffer_traits(other),
-        buffer<T>(std::move(other)),
+        buffer<T>(static_cast<iterator_buffer&&>(other)),
         out_(other.out_) {
     if (this->data() != out_) {
       this->set(data_, buffer_size);
@@ -989,38 +1011,37 @@ class iterator_buffer<T*, T, fixed_buffer_traits> final
 };
 
 template <typename T> class iterator_buffer<T*, T> final : public buffer<T> {
- protected:
-  FMT_CONSTEXPR20 void grow(size_t) override {}
-
  public:
-  explicit iterator_buffer(T* out, size_t = 0) : buffer<T>(out, 0, ~size_t()) {}
+  explicit iterator_buffer(T* out, size_t = 0)
+      : buffer<T>([](buffer<T>&, size_t) {}, out, 0, ~size_t()) {}
 
   auto out() -> T* { return &*this->end(); }
 };
 
 // A buffer that writes to a container with the contiguous storage.
 template <typename Container>
-class iterator_buffer<std::back_insert_iterator<Container>,
+class iterator_buffer<back_insert_iterator<Container>,
                       enable_if_t<is_contiguous<Container>::value,
                                   typename Container::value_type>>
     final : public buffer<typename Container::value_type> {
  private:
+  using value_type = typename Container::value_type;
   Container& container_;
 
- protected:
-  FMT_CONSTEXPR20 void grow(size_t capacity) override {
-    container_.resize(capacity);
-    this->set(&container_[0], capacity);
+  static FMT_CONSTEXPR20 void grow(buffer<value_type>& buf, size_t capacity) {
+    auto& self = static_cast<iterator_buffer&>(buf);
+    self.container_.resize(capacity);
+    self.set(&self.container_[0], capacity);
   }
 
  public:
   explicit iterator_buffer(Container& c)
-      : buffer<typename Container::value_type>(c.size()), container_(c) {}
-  explicit iterator_buffer(std::back_insert_iterator<Container> out, size_t = 0)
+      : buffer<value_type>(grow, c.size()), container_(c) {}
+  explicit iterator_buffer(back_insert_iterator<Container> out, size_t = 0)
       : iterator_buffer(get_container(out)) {}
 
-  auto out() -> std::back_insert_iterator<Container> {
-    return std::back_inserter(container_);
+  auto out() -> back_insert_iterator<Container> {
+    return fmt::back_inserter(container_);
   }
 };
 
@@ -1031,15 +1052,14 @@ template <typename T = char> class counting_buffer final : public buffer<T> {
   T data_[buffer_size];
   size_t count_ = 0;
 
- protected:
-  FMT_CONSTEXPR20 void grow(size_t) override {
-    if (this->size() != buffer_size) return;
-    count_ += this->size();
-    this->clear();
+  static FMT_CONSTEXPR20 void grow(buffer<T>& buf, size_t) {
+    if (buf.size() != buffer_size) return;
+    static_cast<counting_buffer&>(buf).count_ += buf.size();
+    buf.clear();
   }
 
  public:
-  counting_buffer() : buffer<T>(data_, 0, buffer_size) {}
+  counting_buffer() : buffer<T>(grow, data_, 0, buffer_size) {}
 
   auto count() -> size_t { return count_ + this->size(); }
 };
@@ -1053,7 +1073,7 @@ FMT_CONSTEXPR void basic_format_parse_context<Char>::do_check_arg_id(int id) {
       (!FMT_GCC_VERSION || FMT_GCC_VERSION >= 1200)) {
     using context = detail::compile_parse_context<Char>;
     if (id >= static_cast<context*>(this)->num_args())
-      detail::throw_format_error("argument not found");
+      throw_format_error("argument not found");
   }
 }
 
@@ -1085,18 +1105,29 @@ template <typename T, typename Context>
 using has_formatter =
     std::is_constructible<typename Context::template formatter_type<T>>;
 
-// An output iterator that appends to a buffer.
-// It is used to reduce symbol sizes for the common case.
-class appender : public std::back_insert_iterator<detail::buffer<char>> {
-  using base = std::back_insert_iterator<detail::buffer<char>>;
+// An output iterator that appends to a buffer. It is used instead of
+// back_insert_iterator to reduce symbol sizes for the common case.
+class appender {
+ private:
+  detail::buffer<char>* buffer_;
+
+  friend auto get_container(appender app) -> detail::buffer<char>& {
+    return *app.buffer_;
+  }
 
  public:
-  using std::back_insert_iterator<detail::buffer<char>>::back_insert_iterator;
-  appender(base it) noexcept : base(it) {}
+  using difference_type = ptrdiff_t;
   FMT_UNCHECKED_ITERATOR(appender);
 
-  auto operator++() noexcept -> appender& { return *this; }
-  auto operator++(int) noexcept -> appender { return *this; }
+  appender(detail::buffer<char>& buf) : buffer_(&buf) {}
+
+  auto operator=(char c) -> appender& {
+    buffer_->push_back(c);
+    return *this;
+  }
+  auto operator*() -> appender& { return *this; }
+  auto operator++() -> appender& { return *this; }
+  auto operator++(int) -> appender { return *this; }
 };
 
 namespace detail {
@@ -1119,7 +1150,7 @@ constexpr auto has_const_formatter() -> bool {
 
 template <typename T>
 using buffer_appender = conditional_t<std::is_same<T, char>::value, appender,
-                                      std::back_insert_iterator<buffer<T>>>;
+                                      back_insert_iterator<buffer<T>>>;
 
 // Maps an output iterator to a buffer.
 template <typename T, typename OutputIt>
@@ -1128,7 +1159,7 @@ auto get_buffer(OutputIt out) -> iterator_buffer<OutputIt, T> {
 }
 template <typename T, typename Buf,
           FMT_ENABLE_IF(std::is_base_of<buffer<char>, Buf>::value)>
-auto get_buffer(std::back_insert_iterator<Buf> out) -> buffer<char>& {
+auto get_buffer(back_insert_iterator<Buf> out) -> buffer<char>& {
   return get_container(out);
 }
 
@@ -1293,7 +1324,13 @@ template <typename Context> class value {
 
   template <typename T> FMT_CONSTEXPR20 FMT_INLINE value(T& val) {
     using value_type = remove_const_t<T>;
-    custom.value = const_cast<value_type*>(std::addressof(val));
+    // T may overload operator& e.g. std::vector<bool>::reference in libc++.
+#ifdef __cpp_if_constexpr
+    if constexpr (std::is_same<decltype(&val), T*>::value)
+      custom.value = const_cast<value_type*>(&val);
+#endif
+    if (!is_constant_evaluated())
+      custom.value = const_cast<char*>(&reinterpret_cast<const char&>(val));
     // Get the formatter type through the context to allow different contexts
     // have different extension points, e.g. `formatter<T>` for `format` and
     // `printf_formatter<T>` for `printf`.
@@ -1314,6 +1351,7 @@ template <typename Context> class value {
     parse_ctx.advance_to(f.parse(parse_ctx));
     using qualified_type =
         conditional_t<has_const_formatter<T, Context>(), const T, T>;
+    // Calling format through a mutable reference is deprecated.
     ctx.advance_to(f.format(*static_cast<qualified_type*>(arg), ctx));
   }
 };
@@ -1327,7 +1365,7 @@ using ulong_type = conditional_t<long_short, unsigned, unsigned long long>;
 template <typename T> struct format_as_result {
   template <typename U,
             FMT_ENABLE_IF(std::is_enum<U>::value || std::is_class<U>::value)>
-  static auto map(U*) -> decltype(format_as(std::declval<U>()));
+  static auto map(U*) -> remove_cvref_t<decltype(format_as(std::declval<U>()))>;
   static auto map(...) -> void;
 
   using type = decltype(map(static_cast<T*>(nullptr)));
@@ -1444,7 +1482,8 @@ template <typename Context> struct arg_mapper {
   // Only map owning types because mapping views can be unsafe.
   template <typename T, typename U = format_as_t<T>,
             FMT_ENABLE_IF(std::is_arithmetic<U>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto map(const T& val) -> decltype(this->map(U())) {
+  FMT_CONSTEXPR FMT_INLINE auto map(const T& val)
+      -> decltype(FMT_DECLTYPE_THIS map(U())) {
     return map(format_as(val));
   }
 
@@ -1468,13 +1507,14 @@ template <typename Context> struct arg_mapper {
                           !is_string<U>::value && !is_char<U>::value &&
                           !is_named_arg<U>::value &&
                           !std::is_arithmetic<format_as_t<U>>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto map(T& val) -> decltype(this->do_map(val)) {
+  FMT_CONSTEXPR FMT_INLINE auto map(T& val)
+      -> decltype(FMT_DECLTYPE_THIS do_map(val)) {
     return do_map(val);
   }
 
   template <typename T, FMT_ENABLE_IF(is_named_arg<T>::value)>
   FMT_CONSTEXPR FMT_INLINE auto map(const T& named_arg)
-      -> decltype(this->map(named_arg.value)) {
+      -> decltype(FMT_DECLTYPE_THIS map(named_arg.value)) {
     return map(named_arg.value);
   }
 
@@ -1493,45 +1533,19 @@ enum { max_packed_args = 62 / packed_arg_bits };
 enum : unsigned long long { is_unpacked_bit = 1ULL << 63 };
 enum : unsigned long long { has_named_args_bit = 1ULL << 62 };
 
-template <typename Char, typename InputIt>
-auto copy_str(InputIt begin, InputIt end, appender out) -> appender {
-  get_container(out).append(begin, end);
-  return out;
-}
-template <typename Char, typename InputIt>
-auto copy_str(InputIt begin, InputIt end,
-              std::back_insert_iterator<std::string> out)
-    -> std::back_insert_iterator<std::string> {
-  get_container(out).append(begin, end);
-  return out;
-}
-
-template <typename Char, typename R, typename OutputIt>
-FMT_CONSTEXPR auto copy_str(R&& rng, OutputIt out) -> OutputIt {
-  return detail::copy_str<Char>(rng.begin(), rng.end(), out);
-}
-
-#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500
-// A workaround for gcc 4.8 to make void_t work in a SFINAE context.
-template <typename...> struct void_t_impl { using type = void; };
-template <typename... T> using void_t = typename void_t_impl<T...>::type;
-#else
-template <typename...> using void_t = void;
-#endif
-
 template <typename It, typename T, typename Enable = void>
 struct is_output_iterator : std::false_type {};
 
+template <> struct is_output_iterator<appender, char> : std::true_type {};
+
 template <typename It, typename T>
 struct is_output_iterator<
-    It, T,
-    void_t<typename std::iterator_traits<It>::iterator_category,
-           decltype(*std::declval<It>() = std::declval<T>())>>
+    It, T, void_t<decltype(*std::declval<It&>()++ = std::declval<T>())>>
     : std::true_type {};
 
 template <typename It> struct is_back_insert_iterator : std::false_type {};
 template <typename Container>
-struct is_back_insert_iterator<std::back_insert_iterator<Container>>
+struct is_back_insert_iterator<back_insert_iterator<Container>>
     : std::true_type {};
 
 // A type-erased reference to an std::locale to avoid a heavy <locale> include.
@@ -1607,8 +1621,8 @@ FMT_CONSTEXPR inline auto make_arg(T& val) -> basic_format_arg<Context> {
 }  // namespace detail
 FMT_BEGIN_EXPORT
 
-// A formatting argument. It is a trivially copyable/constructible type to
-// allow storage in basic_memory_buffer.
+// A formatting argument. Context is a template parameter for the compiled API
+// where output can be unbuffered.
 template <typename Context> class basic_format_arg {
  private:
   detail::value<Context> value_;
@@ -1618,11 +1632,6 @@ template <typename Context> class basic_format_arg {
   friend FMT_CONSTEXPR auto detail::make_arg(T& value)
       -> basic_format_arg<ContextType>;
 
-  template <typename Visitor, typename Ctx>
-  friend FMT_CONSTEXPR auto visit_format_arg(Visitor&& vis,
-                                             const basic_format_arg<Ctx>& arg)
-      -> decltype(vis(0));
-
   friend class basic_format_args<Context>;
   friend class dynamic_format_arg_store<Context>;
 
@@ -1660,55 +1669,68 @@ template <typename Context> class basic_format_arg {
   auto is_arithmetic() const -> bool {
     return detail::is_arithmetic_type(type_);
   }
+
+  /**
+    \rst
+    Visits the argument: calls ``vis`` with the stored value in its own type
+    (``double`` for a ``double`` argument, ``monostate`` for an empty one).
+    It is const so that ``visit_format_arg`` can call it on a const argument.
+    \endrst
+  */
+  template <typename Visitor>
+  FMT_CONSTEXPR auto visit(Visitor&& vis) const -> decltype(vis(0)) {
+    switch (type_) {
+    case detail::type::none_type:
+      break;
+    case detail::type::int_type:
+      return vis(value_.int_value);
+    case detail::type::uint_type:
+      return vis(value_.uint_value);
+    case detail::type::long_long_type:
+      return vis(value_.long_long_value);
+    case detail::type::ulong_long_type:
+      return vis(value_.ulong_long_value);
+    case detail::type::int128_type:
+      return vis(detail::convert_for_visit(value_.int128_value));
+    case detail::type::uint128_type:
+      return vis(detail::convert_for_visit(value_.uint128_value));
+    case detail::type::bool_type:
+      return vis(value_.bool_value);
+    case detail::type::char_type:
+      return vis(value_.char_value);
+    case detail::type::float_type:
+      return vis(value_.float_value);
+    case detail::type::double_type:
+      return vis(value_.double_value);
+    case detail::type::long_double_type:
+      return vis(value_.long_double_value);
+    case detail::type::cstring_type:
+      return vis(value_.string.data);
+    case detail::type::string_type:
+      using sv = basic_string_view<typename Context::char_type>;
+      return vis(sv(value_.string.data, value_.string.size));
+    case detail::type::pointer_type:
+      return vis(value_.pointer);
+    case detail::type::custom_type:
+      return vis(typename basic_format_arg<Context>::handle(value_.custom));
+    }
+    return vis(monostate());
+  }
+
+  FMT_INLINE auto format_custom(const char_type* parse_begin,
+                                typename Context::parse_context_type& parse_ctx,
+                                Context& ctx) -> bool {
+    if (type_ != detail::type::custom_type) return false;
+    parse_ctx.advance_to(parse_begin);
+    value_.custom.format(value_.custom.value, parse_ctx, ctx);
+    return true;
+  }
 };
 
-/**
-  \rst
-  Visits an argument dispatching to the appropriate visit method based on
-  the argument type. For example, if the argument type is ``double`` then
-  ``vis(value)`` will be called with the value of type ``double``.
-  \endrst
- */
-// DEPRECATED!
 template <typename Visitor, typename Context>
-FMT_CONSTEXPR FMT_INLINE auto visit_format_arg(
+FMT_DEPRECATED FMT_CONSTEXPR FMT_INLINE auto visit_format_arg(
     Visitor&& vis, const basic_format_arg<Context>& arg) -> decltype(vis(0)) {
-  switch (arg.type_) {
-  case detail::type::none_type:
-    break;
-  case detail::type::int_type:
-    return vis(arg.value_.int_value);
-  case detail::type::uint_type:
-    return vis(arg.value_.uint_value);
-  case detail::type::long_long_type:
-    return vis(arg.value_.long_long_value);
-  case detail::type::ulong_long_type:
-    return vis(arg.value_.ulong_long_value);
-  case detail::type::int128_type:
-    return vis(detail::convert_for_visit(arg.value_.int128_value));
-  case detail::type::uint128_type:
-    return vis(detail::convert_for_visit(arg.value_.uint128_value));
-  case detail::type::bool_type:
-    return vis(arg.value_.bool_value);
-  case detail::type::char_type:
-    return vis(arg.value_.char_value);
-  case detail::type::float_type:
-    return vis(arg.value_.float_value);
-  case detail::type::double_type:
-    return vis(arg.value_.double_value);
-  case detail::type::long_double_type:
-    return vis(arg.value_.long_double_value);
-  case detail::type::cstring_type:
-    return vis(arg.value_.string.data);
-  case detail::type::string_type:
-    using sv = basic_string_view<typename Context::char_type>;
-    return vis(sv(arg.value_.string.data, arg.value_.string.size));
-  case detail::type::pointer_type:
-    return vis(arg.value_.pointer);
-  case detail::type::custom_type:
-    return vis(typename basic_format_arg<Context>::handle(arg.value_.custom));
-  }
-  return vis(monostate());
+  return arg.visit(std::forward<Visitor>(vis));
 }
 
 // Formatting context.
@@ -1748,8 +1770,8 @@ template <typename OutputIt, typename Char> class basic_format_context {
   }
   auto args() const -> const format_args& { return args_; }
 
-  FMT_CONSTEXPR auto error_handler() -> detail::error_handler { return {}; }
-  void on_error(const char* message) { error_handler().on_error(message); }
+  // This function is intentionally not constexpr to give a compile-time error.
+  void on_error(const char* message) { throw_format_error(message); }
 
   // Returns an iterator to the beginning of the output range.
   FMT_CONSTEXPR auto out() -> iterator { return out_; }
@@ -1831,7 +1853,7 @@ class format_arg_store
 // Arguments are taken by lvalue references to avoid some lifetime issues.
 template <typename Context = format_context, typename... T>
 constexpr auto make_format_args(T&... args)
-    -> format_arg_store<Context, remove_cvref_t<T>...> {
+    -> format_arg_store<Context, remove_const_t<T>...> {
   return {args...};
 }
 
@@ -2107,11 +2129,8 @@ struct dynamic_format_specs : format_specs<Char> {
 };
 
 // Converts a character to ASCII. Returns '\0' on conversion failure.
-template <typename Char, FMT_ENABLE_IF(std::is_integral<Char>::value)>
-constexpr auto to_ascii(Char c) -> char {
-  return c <= 0xff ? static_cast<char>(c) : '\0';
-}
-template <typename Char, FMT_ENABLE_IF(std::is_enum<Char>::value)>
+template <typename Char, FMT_ENABLE_IF(std::is_integral<Char>::value ||
+                                       std::is_enum<Char>::value)>
 constexpr auto to_ascii(Char c) -> char {
   return c <= 0xff ? static_cast<char>(c) : '\0';
 }
@@ -2156,11 +2175,11 @@ FMT_CONSTEXPR auto parse_nonnegative_int(const Char*& begin, const Char* end,
   } while (p != end && '0' <= *p && *p <= '9');
   auto num_digits = p - begin;
   begin = p;
-  if (num_digits <= std::numeric_limits<int>::digits10)
-    return static_cast<int>(value);
+  int digits10 = static_cast<int>(sizeof(int) * CHAR_BIT * 3 / 10);
+  if (num_digits <= digits10) return static_cast<int>(value);
   // Check for overflow.
-  const unsigned max = to_unsigned((std::numeric_limits<int>::max)());
-  return num_digits == std::numeric_limits<int>::digits10 + 1 &&
+  unsigned max = INT_MAX;
+  return num_digits == digits10 + 1 &&
                  prev * 10ull + unsigned(p[-1] - '0') <= max
              ? static_cast<int>(value)
              : error_value;
@@ -2188,9 +2207,8 @@ FMT_CONSTEXPR auto do_parse_arg_id(const Char* begin, const Char* end,
   Char c = *begin;
   if (c >= '0' && c <= '9') {
     int index = 0;
-    constexpr int max = (std::numeric_limits<int>::max)();
     if (c != '0')
-      index = parse_nonnegative_int(begin, end, max);
+      index = parse_nonnegative_int(begin, end, INT_MAX);
     else
       ++begin;
     if (begin == end || (*begin != '}' && *begin != ':'))
@@ -2309,9 +2327,12 @@ FMT_CONSTEXPR FMT_INLINE auto parse_format_specs(
     dynamic_format_specs<Char>& specs;
     type arg_type;
 
-    FMT_CONSTEXPR auto operator()(pres type, int set) -> const Char* {
-      if (!in(arg_type, set)) throw_format_error("invalid format specifier");
-      specs.type = type;
+    FMT_CONSTEXPR auto operator()(pres pres_type, int set) -> const Char* {
+      if (!in(arg_type, set)) {
+        if (arg_type == type::none_type) return begin;
+        throw_format_error("invalid format specifier");
+      }
+      specs.type = pres_type;
       return begin + 1;
     }
   } parse_presentation_type{begin, specs, arg_type};
@@ -2328,6 +2349,7 @@ FMT_CONSTEXPR FMT_INLINE auto parse_format_specs(
     case '+':
     case '-':
     case ' ':
+      if (arg_type == type::none_type) return begin;
       enter_state(state::sign, in(arg_type, sint_set | float_set));
       switch (c) {
       case '+':
@@ -2343,14 +2365,17 @@ FMT_CONSTEXPR FMT_INLINE auto parse_format_specs(
       ++begin;
       break;
     case '#':
+      if (arg_type == type::none_type) return begin;
       enter_state(state::hash, is_arithmetic_type(arg_type));
       specs.alt = true;
       ++begin;
       break;
     case '0':
       enter_state(state::zero);
-      if (!is_arithmetic_type(arg_type))
+      if (!is_arithmetic_type(arg_type)) {
+        if (arg_type == type::none_type) return begin;
         throw_format_error("format specifier requires numeric argument");
+      }
       if (specs.align == align::none) {
         // Ignore 0 if align is specified for compatibility with std::format.
         specs.align = align::numeric;
@@ -2372,12 +2397,14 @@ FMT_CONSTEXPR FMT_INLINE auto parse_format_specs(
       begin = parse_dynamic_spec(begin, end, specs.width, specs.width_ref, ctx);
       break;
     case '.':
+      if (arg_type == type::none_type) return begin;
       enter_state(state::precision,
                   in(arg_type, float_set | string_set | cstring_set));
       begin = parse_precision(begin, end, specs.precision, specs.precision_ref,
                               ctx);
       break;
     case 'L':
+      if (arg_type == type::none_type) return begin;
       enter_state(state::locale, is_arithmetic_type(arg_type));
       specs.localized = true;
       ++begin;
@@ -2411,6 +2438,8 @@ FMT_CONSTEXPR FMT_INLINE auto parse_format_specs(
     case 'G':
       return parse_presentation_type(pres::general_upper, float_set);
     case 'c':
+      if (arg_type == type::bool_type)
+        throw_format_error("invalid format specifier");
       return parse_presentation_type(pres::chr, integral_set);
     case 's':
       return parse_presentation_type(pres::string,
@@ -2550,9 +2579,9 @@ FMT_CONSTEXPR auto parse_format_specs(ParseContext& ctx)
       decltype(arg_mapper<context>().map(std::declval<const T&>())),
       typename strip_named_arg<T>::type>;
 // LAMMPS customization. Fails to compile with (some) Intel compilers
-#if defined(__cpp_if_constexpr) && 0
-  if constexpr (std::is_default_constructible_v<
-                    formatter<mapped_type, char_type>>) {
+#if defined(__cpp_if_constexpr) && 1
+  if constexpr (std::is_default_constructible<
+                    formatter<mapped_type, char_type>>::value) {
     return formatter<mapped_type, char_type>().parse(ctx);
   } else {
     type_is_unformattable_for<T, char_type> _;
@@ -2675,9 +2704,11 @@ void check_format_string(S format_str) {
 
 template <typename Char = char> struct vformat_args {
   using type = basic_format_args<
-      basic_format_context<std::back_insert_iterator<buffer<Char>>, Char>>;
+      basic_format_context<back_insert_iterator<buffer<Char>>, Char>>;
+};
+template <> struct vformat_args<char> {
+  using type = format_args;
 };
-template <> struct vformat_args<char> { using type = format_args; };
 
 // Use vformat_args and avoid type_identity to keep symbols short.
 template <typename Char>
@@ -2779,7 +2810,7 @@ using format_string = basic_format_string<char, type_identity_t<Args>...>;
 inline auto runtime(string_view s) -> runtime_format_string<> { return {{s}}; }
 #endif
 
-FMT_API auto vformat(string_view fmt, format_args args) -> std::string;
+FMT_API auto vformat(string_view fmt, format_args args) -> basic_string<char>;
 
 /**
   \rst
@@ -2794,7 +2825,7 @@ FMT_API auto vformat(string_view fmt, format_args args) -> std::string;
 */
 template <typename... T>
 FMT_NODISCARD FMT_INLINE auto format(format_string<T...> fmt, T&&... args)
-    -> std::string {
+    -> basic_string<char> {
   return vformat(fmt, fmt::make_format_args(args...));
 }
 
@@ -2816,7 +2847,7 @@ auto vformat_to(OutputIt out, string_view fmt, format_args args) -> OutputIt {
  **Example**::
 
    auto out = std::vector<char>();
-   fmt::format_to(std::back_inserter(out), "{}", 42);
+   fmt::format_to(fmt::back_inserter(out), "{}", 42);
  \endrst
  */
 template <typename OutputIt, typename... T,
diff --git a/src/fmt/format-inl.h b/src/fmt/format-inl.h
index dac2d437a4..8da1c17f36 100644
--- a/src/fmt/format-inl.h
+++ b/src/fmt/format-inl.h
@@ -18,7 +18,7 @@
 #  include <locale>
 #endif
 
-#ifdef _WIN32
+#if defined(_WIN32) && !defined(FMT_WINDOWS_NO_WCHAR)
 #  include <io.h>  // _isatty
 #endif
 
@@ -36,10 +36,6 @@ FMT_FUNC void assert_fail(const char* file, int line, const char* message) {
   std::terminate();
 }
 
-FMT_FUNC void throw_format_error(const char* message) {
-  FMT_THROW(format_error(message));
-}
-
 FMT_FUNC void format_error_code(detail::buffer<char>& out, int error_code,
                                 string_view message) noexcept {
   // Report error code making sure that the output fits into
@@ -58,8 +54,8 @@ FMT_FUNC void format_error_code(detail::buffer<char>& out, int error_code,
   error_code_size += detail::to_unsigned(detail::count_digits(abs_value));
   auto it = buffer_appender<char>(out);
   if (message.size() <= inline_buffer_size - error_code_size)
-    format_to(it, FMT_STRING("{}{}"), message, SEP);
-  format_to(it, FMT_STRING("{}{}"), ERROR_STR, error_code);
+    fmt::format_to(it, FMT_STRING("{}{}"), message, SEP);
+  fmt::format_to(it, FMT_STRING("{}{}"), ERROR_STR, error_code);
   FMT_ASSERT(out.size() <= inline_buffer_size, "");
 }
 
@@ -73,9 +69,8 @@ FMT_FUNC void report_error(format_func func, int error_code,
 }
 
 // A wrapper around fwrite that throws on error.
-inline void fwrite_fully(const void* ptr, size_t size, size_t count,
-                         FILE* stream) {
-  size_t written = std::fwrite(ptr, size, count, stream);
+inline void fwrite_fully(const void* ptr, size_t count, FILE* stream) {
+  size_t written = std::fwrite(ptr, 1, count, stream);
   if (written < count)
     FMT_THROW(system_error(errno, FMT_STRING("cannot write to file")));
 }
@@ -86,7 +81,7 @@ locale_ref::locale_ref(const Locale& loc) : locale_(&loc) {
   static_assert(std::is_same<Locale, std::locale>::value, "");
 }
 
-template <typename Locale> Locale locale_ref::get() const {
+template <typename Locale> auto locale_ref::get() const -> Locale {
   static_assert(std::is_same<Locale, std::locale>::value, "");
   return locale_ ? *static_cast<const std::locale*>(locale_) : std::locale();
 }
@@ -98,7 +93,8 @@ FMT_FUNC auto thousands_sep_impl(locale_ref loc) -> thousands_sep_result<Char> {
   auto thousands_sep = grouping.empty() ? Char() : facet.thousands_sep();
   return {std::move(grouping), thousands_sep};
 }
-template <typename Char> FMT_FUNC Char decimal_point_impl(locale_ref loc) {
+template <typename Char>
+FMT_FUNC auto decimal_point_impl(locale_ref loc) -> Char {
   return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>())
       .decimal_point();
 }
@@ -127,6 +123,10 @@ FMT_FUNC auto write_loc(appender out, loc_value value,
 }
 }  // namespace detail
 
+FMT_FUNC void throw_format_error(const char* message) {
+  FMT_THROW(format_error(message));
+}
+
 template <typename Locale> typename Locale::id format_facet<Locale>::id;
 
 #ifndef FMT_STATIC_THOUSANDS_SEPARATOR
@@ -144,24 +144,25 @@ FMT_API FMT_FUNC auto format_facet<std::locale>::do_put(
 }
 #endif
 
-FMT_FUNC std::system_error vsystem_error(int error_code, string_view fmt,
-                                         format_args args) {
+FMT_FUNC auto vsystem_error(int error_code, string_view fmt, format_args args)
+    -> std::system_error {
   auto ec = std::error_code(error_code, std::generic_category());
   return std::system_error(ec, vformat(fmt, args));
 }
 
 namespace detail {
 
-template <typename F> inline bool operator==(basic_fp<F> x, basic_fp<F> y) {
+template <typename F>
+inline auto operator==(basic_fp<F> x, basic_fp<F> y) -> bool {
   return x.f == y.f && x.e == y.e;
 }
 
 // Compilers should be able to optimize this into the ror instruction.
-FMT_CONSTEXPR inline uint32_t rotr(uint32_t n, uint32_t r) noexcept {
+FMT_CONSTEXPR inline auto rotr(uint32_t n, uint32_t r) noexcept -> uint32_t {
   r &= 31;
   return (n >> r) | (n << (32 - r));
 }
-FMT_CONSTEXPR inline uint64_t rotr(uint64_t n, uint32_t r) noexcept {
+FMT_CONSTEXPR inline auto rotr(uint64_t n, uint32_t r) noexcept -> uint64_t {
   r &= 63;
   return (n >> r) | (n << (64 - r));
 }
@@ -170,14 +171,14 @@ FMT_CONSTEXPR inline uint64_t rotr(uint64_t n, uint32_t r) noexcept {
 namespace dragonbox {
 // Computes upper 64 bits of multiplication of a 32-bit unsigned integer and a
 // 64-bit unsigned integer.
-inline uint64_t umul96_upper64(uint32_t x, uint64_t y) noexcept {
+inline auto umul96_upper64(uint32_t x, uint64_t y) noexcept -> uint64_t {
   return umul128_upper64(static_cast<uint64_t>(x) << 32, y);
 }
 
 // Computes lower 128 bits of multiplication of a 64-bit unsigned integer and a
 // 128-bit unsigned integer.
-inline uint128_fallback umul192_lower128(uint64_t x,
-                                         uint128_fallback y) noexcept {
+inline auto umul192_lower128(uint64_t x, uint128_fallback y) noexcept
+    -> uint128_fallback {
   uint64_t high = x * y.high();
   uint128_fallback high_low = umul128(x, y.low());
   return {high + high_low.high(), high_low.low()};
@@ -185,12 +186,12 @@ inline uint128_fallback umul192_lower128(uint64_t x,
 
 // Computes lower 64 bits of multiplication of a 32-bit unsigned integer and a
 // 64-bit unsigned integer.
-inline uint64_t umul96_lower64(uint32_t x, uint64_t y) noexcept {
+inline auto umul96_lower64(uint32_t x, uint64_t y) noexcept -> uint64_t {
   return x * y;
 }
 
 // Various fast log computations.
-inline int floor_log10_pow2_minus_log10_4_over_3(int e) noexcept {
+inline auto floor_log10_pow2_minus_log10_4_over_3(int e) noexcept -> int {
   FMT_ASSERT(e <= 2936 && e >= -2985, "too large exponent");
   return (e * 631305 - 261663) >> 21;
 }
@@ -204,7 +205,7 @@ FMT_INLINE_VARIABLE constexpr struct {
 // divisible by pow(10, N).
 // Precondition: n <= pow(10, N + 1).
 template <int N>
-bool check_divisibility_and_divide_by_pow10(uint32_t& n) noexcept {
+auto check_divisibility_and_divide_by_pow10(uint32_t& n) noexcept -> bool {
   // The numbers below are chosen such that:
   //   1. floor(n/d) = floor(nm / 2^k) where d=10 or d=100,
   //   2. nm mod 2^k < m if and only if n is divisible by d,
@@ -229,7 +230,7 @@ bool check_divisibility_and_divide_by_pow10(uint32_t& n) noexcept {
 
 // Computes floor(n / pow(10, N)) for small n and N.
 // Precondition: n <= pow(10, N + 1).
-template <int N> uint32_t small_division_by_pow10(uint32_t n) noexcept {
+template <int N> auto small_division_by_pow10(uint32_t n) noexcept -> uint32_t {
   constexpr auto info = div_small_pow10_infos[N - 1];
   FMT_ASSERT(n <= info.divisor * 10, "n is too large");
   constexpr uint32_t magic_number =
@@ -238,12 +239,12 @@ template <int N> uint32_t small_division_by_pow10(uint32_t n) noexcept {
 }
 
 // Computes floor(n / 10^(kappa + 1)) (float)
-inline uint32_t divide_by_10_to_kappa_plus_1(uint32_t n) noexcept {
+inline auto divide_by_10_to_kappa_plus_1(uint32_t n) noexcept -> uint32_t {
   // 1374389535 = ceil(2^37/100)
   return static_cast<uint32_t>((static_cast<uint64_t>(n) * 1374389535) >> 37);
 }
 // Computes floor(n / 10^(kappa + 1)) (double)
-inline uint64_t divide_by_10_to_kappa_plus_1(uint64_t n) noexcept {
+inline auto divide_by_10_to_kappa_plus_1(uint64_t n) noexcept -> uint64_t {
   // 2361183241434822607 = ceil(2^(64+7)/1000)
   return umul128_upper64(n, 2361183241434822607ull) >> 7;
 }
@@ -255,7 +256,7 @@ template <> struct cache_accessor<float> {
   using carrier_uint = float_info<float>::carrier_uint;
   using cache_entry_type = uint64_t;
 
-  static uint64_t get_cached_power(int k) noexcept {
+  static auto get_cached_power(int k) noexcept -> uint64_t {
     FMT_ASSERT(k >= float_info<float>::min_k && k <= float_info<float>::max_k,
                "k is out of range");
     static constexpr const uint64_t pow10_significands[] = {
@@ -297,20 +298,23 @@ template <> struct cache_accessor<float> {
     bool is_integer;
   };
 
-  static compute_mul_result compute_mul(
-      carrier_uint u, const cache_entry_type& cache) noexcept {
+  static auto compute_mul(carrier_uint u,
+                          const cache_entry_type& cache) noexcept
+      -> compute_mul_result {
     auto r = umul96_upper64(u, cache);
     return {static_cast<carrier_uint>(r >> 32),
             static_cast<carrier_uint>(r) == 0};
   }
 
-  static uint32_t compute_delta(const cache_entry_type& cache,
-                                int beta) noexcept {
+  static auto compute_delta(const cache_entry_type& cache, int beta) noexcept
+      -> uint32_t {
     return static_cast<uint32_t>(cache >> (64 - 1 - beta));
   }
 
-  static compute_mul_parity_result compute_mul_parity(
-      carrier_uint two_f, const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_mul_parity(carrier_uint two_f,
+                                 const cache_entry_type& cache,
+                                 int beta) noexcept
+      -> compute_mul_parity_result {
     FMT_ASSERT(beta >= 1, "");
     FMT_ASSERT(beta < 64, "");
 
@@ -319,22 +323,22 @@ template <> struct cache_accessor<float> {
             static_cast<uint32_t>(r >> (32 - beta)) == 0};
   }
 
-  static carrier_uint compute_left_endpoint_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_left_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
     return static_cast<carrier_uint>(
         (cache - (cache >> (num_significand_bits<float>() + 2))) >>
         (64 - num_significand_bits<float>() - 1 - beta));
   }
 
-  static carrier_uint compute_right_endpoint_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_right_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
     return static_cast<carrier_uint>(
         (cache + (cache >> (num_significand_bits<float>() + 1))) >>
         (64 - num_significand_bits<float>() - 1 - beta));
   }
 
-  static carrier_uint compute_round_up_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_round_up_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
     return (static_cast<carrier_uint>(
                 cache >> (64 - num_significand_bits<float>() - 2 - beta)) +
             1) /
@@ -346,7 +350,7 @@ template <> struct cache_accessor<double> {
   using carrier_uint = float_info<double>::carrier_uint;
   using cache_entry_type = uint128_fallback;
 
-  static uint128_fallback get_cached_power(int k) noexcept {
+  static auto get_cached_power(int k) noexcept -> uint128_fallback {
     FMT_ASSERT(k >= float_info<double>::min_k && k <= float_info<double>::max_k,
                "k is out of range");
 
@@ -985,8 +989,7 @@ template <> struct cache_accessor<double> {
       {0xe0accfa875af45a7, 0x93eb1b80a33b8606},
       {0x8c6c01c9498d8b88, 0xbc72f130660533c4},
       {0xaf87023b9bf0ee6a, 0xeb8fad7c7f8680b5},
-      { 0xdb68c2ca82ed2a05,
-        0xa67398db9f6820e2 }
+      {0xdb68c2ca82ed2a05, 0xa67398db9f6820e2},
 #else
       {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
       {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
@@ -1071,19 +1074,22 @@ template <> struct cache_accessor<double> {
     bool is_integer;
   };
 
-  static compute_mul_result compute_mul(
-      carrier_uint u, const cache_entry_type& cache) noexcept {
+  static auto compute_mul(carrier_uint u,
+                          const cache_entry_type& cache) noexcept
+      -> compute_mul_result {
     auto r = umul192_upper128(u, cache);
     return {r.high(), r.low() == 0};
   }
 
-  static uint32_t compute_delta(cache_entry_type const& cache,
-                                int beta) noexcept {
+  static auto compute_delta(cache_entry_type const& cache, int beta) noexcept
+      -> uint32_t {
     return static_cast<uint32_t>(cache.high() >> (64 - 1 - beta));
   }
 
-  static compute_mul_parity_result compute_mul_parity(
-      carrier_uint two_f, const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_mul_parity(carrier_uint two_f,
+                                 const cache_entry_type& cache,
+                                 int beta) noexcept
+      -> compute_mul_parity_result {
     FMT_ASSERT(beta >= 1, "");
     FMT_ASSERT(beta < 64, "");
 
@@ -1092,35 +1098,35 @@ template <> struct cache_accessor<double> {
             ((r.high() << beta) | (r.low() >> (64 - beta))) == 0};
   }
 
-  static carrier_uint compute_left_endpoint_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_left_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
     return (cache.high() -
             (cache.high() >> (num_significand_bits<double>() + 2))) >>
            (64 - num_significand_bits<double>() - 1 - beta);
   }
 
-  static carrier_uint compute_right_endpoint_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_right_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
     return (cache.high() +
             (cache.high() >> (num_significand_bits<double>() + 1))) >>
            (64 - num_significand_bits<double>() - 1 - beta);
   }
 
-  static carrier_uint compute_round_up_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_round_up_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
     return ((cache.high() >> (64 - num_significand_bits<double>() - 2 - beta)) +
             1) /
            2;
   }
 };
 
-FMT_FUNC uint128_fallback get_cached_power(int k) noexcept {
+FMT_FUNC auto get_cached_power(int k) noexcept -> uint128_fallback {
   return cache_accessor<double>::get_cached_power(k);
 }
 
 // Various integer checks
 template <typename T>
-bool is_left_endpoint_integer_shorter_interval(int exponent) noexcept {
+auto is_left_endpoint_integer_shorter_interval(int exponent) noexcept -> bool {
   const int case_shorter_interval_left_endpoint_lower_threshold = 2;
   const int case_shorter_interval_left_endpoint_upper_threshold = 3;
   return exponent >= case_shorter_interval_left_endpoint_lower_threshold &&
@@ -1132,7 +1138,7 @@ FMT_INLINE int remove_trailing_zeros(uint32_t& n, int s = 0) noexcept {
   FMT_ASSERT(n != 0, "");
   // Modular inverse of 5 (mod 2^32): (mod_inv_5 * 5) mod 2^32 = 1.
   constexpr uint32_t mod_inv_5 = 0xcccccccd;
-  constexpr uint32_t mod_inv_25 = 0xc28f5c29; // = mod_inv_5 * mod_inv_5
+  constexpr uint32_t mod_inv_25 = 0xc28f5c29;  // = mod_inv_5 * mod_inv_5
 
   while (true) {
     auto q = rotr(n * mod_inv_25, 2);
@@ -1168,7 +1174,7 @@ FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept {
 
   // If n is not divisible by 10^8, work with n itself.
   constexpr uint64_t mod_inv_5 = 0xcccccccccccccccd;
-  constexpr uint64_t mod_inv_25 = 0x8f5c28f5c28f5c29; // = mod_inv_5 * mod_inv_5
+  constexpr uint64_t mod_inv_25 = 0x8f5c28f5c28f5c29;  // mod_inv_5 * mod_inv_5
 
   int s = 0;
   while (true) {
@@ -1234,7 +1240,7 @@ FMT_INLINE decimal_fp<T> shorter_interval_case(int exponent) noexcept {
   return ret_value;
 }
 
-template <typename T> decimal_fp<T> to_decimal(T x) noexcept {
+template <typename T> auto to_decimal(T x) noexcept -> decimal_fp<T> {
   // Step 1: integer promotion & Schubfach multiplier calculation.
 
   using carrier_uint = typename float_info<T>::carrier_uint;
@@ -1373,15 +1379,15 @@ template <> struct formatter<detail::bigint> {
     for (auto i = n.bigits_.size(); i > 0; --i) {
       auto value = n.bigits_[i - 1u];
       if (first) {
-        out = format_to(out, FMT_STRING("{:x}"), value);
+        out = fmt::format_to(out, FMT_STRING("{:x}"), value);
         first = false;
         continue;
       }
-      out = format_to(out, FMT_STRING("{:08x}"), value);
+      out = fmt::format_to(out, FMT_STRING("{:08x}"), value);
     }
     if (n.exp_ > 0)
-      out = format_to(out, FMT_STRING("p{}"),
-                      n.exp_ * detail::bigint::bigit_bits);
+      out = fmt::format_to(out, FMT_STRING("p{}"),
+                           n.exp_ * detail::bigint::bigit_bits);
     return out;
   }
 };
@@ -1417,7 +1423,7 @@ FMT_FUNC void report_system_error(int error_code,
   report_error(format_system_error, error_code, message);
 }
 
-FMT_FUNC std::string vformat(string_view fmt, format_args args) {
+FMT_FUNC auto vformat(string_view fmt, format_args args) -> std::string {
   // Don't optimize the "{}" case to keep the binary size small and because it
   // can be better optimized in fmt::format anyway.
   auto buffer = memory_buffer();
@@ -1426,33 +1432,38 @@ FMT_FUNC std::string vformat(string_view fmt, format_args args) {
 }
 
 namespace detail {
-#ifndef _WIN32
-FMT_FUNC bool write_console(std::FILE*, string_view) { return false; }
+#if !defined(_WIN32) || defined(FMT_WINDOWS_NO_WCHAR)
+FMT_FUNC auto write_console(int, string_view) -> bool { return false; }
 #else
 using dword = conditional_t<sizeof(long) == 4, unsigned long, unsigned>;
 extern "C" __declspec(dllimport) int __stdcall WriteConsoleW(  //
     void*, const void*, dword, dword*, void*);
 
-FMT_FUNC bool write_console(std::FILE* f, string_view text) {
-  auto fd = _fileno(f);
-  if (!_isatty(fd)) return false;
+FMT_FUNC bool write_console(int fd, string_view text) {
   auto u16 = utf8_to_utf16(text);
-  auto written = dword();
   return WriteConsoleW(reinterpret_cast<void*>(_get_osfhandle(fd)), u16.c_str(),
-                       static_cast<uint32_t>(u16.size()), &written, nullptr) != 0;
+                       static_cast<dword>(u16.size()), nullptr, nullptr) != 0;
 }
+#endif
 
+#ifdef _WIN32
 // Print assuming legacy (non-Unicode) encoding.
 FMT_FUNC void vprint_mojibake(std::FILE* f, string_view fmt, format_args args) {
   auto buffer = memory_buffer();
-  detail::vformat_to(buffer, fmt,
-                     basic_format_args<buffer_context<char>>(args));
-  fwrite_fully(buffer.data(), 1, buffer.size(), f);
+  detail::vformat_to(buffer, fmt, args);
+  fwrite_fully(buffer.data(), buffer.size(), f);
 }
 #endif
 
 FMT_FUNC void print(std::FILE* f, string_view text) {
-  if (!write_console(f, text)) fwrite_fully(text.data(), 1, text.size(), f);
+#ifdef _WIN32
+  int fd = _fileno(f);
+  if (_isatty(fd)) {
+    std::fflush(f);
+    if (write_console(fd, text)) return;
+  }
+#endif
+  fwrite_fully(text.data(), text.size(), f);
 }
 }  // namespace detail
 
diff --git a/src/fmt/format.h b/src/fmt/format.h
index 87a34b972c..8cdf95b7bd 100644
--- a/src/fmt/format.h
+++ b/src/fmt/format.h
@@ -37,17 +37,28 @@
 #include <cstdint>           // uint32_t
 #include <cstring>           // std::memcpy
 #include <initializer_list>  // std::initializer_list
-#include <limits>            // std::numeric_limits
-#include <memory>            // std::uninitialized_copy
-#include <stdexcept>         // std::runtime_error
-#include <system_error>      // std::system_error
+#include <iterator>
+#include <limits>        // std::numeric_limits
+#include <memory>        // std::uninitialized_copy
+#include <stdexcept>     // std::runtime_error
+#include <system_error>  // std::system_error
 
 #ifdef __cpp_lib_bit_cast
-#  include <bit>  // std::bitcast
+#  include <bit>  // std::bit_cast
 #endif
 
 #include "core.h"
 
+// libc++ supports string_view in pre-c++17.
+#if FMT_HAS_INCLUDE(<string_view>) && \
+    (FMT_CPLUSPLUS >= 201703L || defined(_LIBCPP_VERSION))
+#  include <string_view>
+#  define FMT_USE_STRING_VIEW
+#elif FMT_HAS_INCLUDE("experimental/string_view") && FMT_CPLUSPLUS >= 201402L
+#  include <experimental/string_view>
+#  define FMT_USE_EXPERIMENTAL_STRING_VIEW
+#endif
+
 #if defined __cpp_inline_variables && __cpp_inline_variables >= 201606L
 #  define FMT_INLINE_VARIABLE inline
 #else
@@ -65,25 +76,11 @@
 #  define FMT_FALLTHROUGH
 #endif
 
-#ifndef FMT_DEPRECATED
-#  if FMT_HAS_CPP14_ATTRIBUTE(deprecated) || FMT_MSC_VERSION >= 1900
-#    define FMT_DEPRECATED [[deprecated]]
-#  else
-#    if (defined(__GNUC__) && !defined(__LCC__)) || defined(__clang__)
-#      define FMT_DEPRECATED __attribute__((deprecated))
-#    elif FMT_MSC_VERSION
-#      define FMT_DEPRECATED __declspec(deprecated)
-#    else
-#      define FMT_DEPRECATED /* deprecated */
-#    endif
-#  endif
-#endif
-
 #ifndef FMT_NO_UNIQUE_ADDRESS
 #  if FMT_CPLUSPLUS >= 202002L
 #    if FMT_HAS_CPP_ATTRIBUTE(no_unique_address)
 #      define FMT_NO_UNIQUE_ADDRESS [[no_unique_address]]
-// VS2019 v16.10 and later except clang-cl (https://reviews.llvm.org/D110485)
+// VS2019 v16.10 and later except clang-cl (https://reviews.llvm.org/D110485).
 #    elif (FMT_MSC_VERSION >= 1929) && !FMT_CLANG_VERSION
 #      define FMT_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]]
 #    endif
@@ -93,10 +90,11 @@
 #  define FMT_NO_UNIQUE_ADDRESS
 #endif
 
-#if FMT_GCC_VERSION || defined(__clang__)
-#  define FMT_VISIBILITY(value) __attribute__((visibility(value)))
+// Visibility when compiled as a shared library/object.
+#if defined(FMT_LIB_EXPORT) || defined(FMT_SHARED)
+#  define FMT_SO_VISIBILITY(value) FMT_VISIBILITY(value)
 #else
-#  define FMT_VISIBILITY(value)
+#  define FMT_SO_VISIBILITY(value)
 #endif
 
 #ifdef __has_builtin
@@ -152,7 +150,10 @@ FMT_END_NAMESPACE
 
 #ifndef FMT_USE_USER_DEFINED_LITERALS
 // EDG based compilers (Intel, NVIDIA, Elbrus, etc), GCC and MSVC support UDLs.
-#  if (FMT_HAS_FEATURE(cxx_user_literals) || FMT_GCC_VERSION >= 407 || \
+//
+// GCC before 4.9 requires a space in `operator"" _a` which is invalid in later
+// compiler versions.
+#  if (FMT_HAS_FEATURE(cxx_user_literals) || FMT_GCC_VERSION >= 409 || \
        FMT_MSC_VERSION >= 1900) &&                                     \
       (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= /* UDL feature */ 480)
 #    define FMT_USE_USER_DEFINED_LITERALS 1
@@ -272,20 +273,19 @@ inline auto ctzll(uint64_t x) -> int {
 FMT_END_NAMESPACE
 #endif
 
+namespace std {
+template <> struct iterator_traits<fmt::appender> {
+  using value_type = void;
+  using iterator_category = std::output_iterator_tag;
+};
+template <typename Container>
+struct iterator_traits<fmt::back_insert_iterator<Container>> {
+  using value_type = void;
+  using iterator_category = std::output_iterator_tag;
+};
+}  // namespace std
+
 FMT_BEGIN_NAMESPACE
-
-template <typename...> struct disjunction : std::false_type {};
-template <typename P> struct disjunction<P> : P {};
-template <typename P1, typename... Pn>
-struct disjunction<P1, Pn...>
-    : conditional_t<bool(P1::value), P1, disjunction<Pn...>> {};
-
-template <typename...> struct conjunction : std::true_type {};
-template <typename P> struct conjunction<P> : P {};
-template <typename P1, typename... Pn>
-struct conjunction<P1, Pn...>
-    : conditional_t<bool(P1::value), conjunction<Pn...>, P1> {};
-
 namespace detail {
 
 FMT_CONSTEXPR inline void abort_fuzzing_if(bool condition) {
@@ -295,6 +295,15 @@ FMT_CONSTEXPR inline void abort_fuzzing_if(bool condition) {
 #endif
 }
 
+#if defined(FMT_USE_STRING_VIEW)
+template <typename Char> using std_string_view = std::basic_string_view<Char>;
+#elif defined(FMT_USE_EXPERIMENTAL_STRING_VIEW)
+template <typename Char>
+using std_string_view = std::experimental::basic_string_view<Char>;
+#else
+template <typename T> struct std_string_view {};
+#endif
+
 template <typename CharT, CharT... C> struct string_literal {
   static constexpr CharT value[sizeof...(C)] = {C...};
   constexpr operator basic_string_view<CharT>() const {
@@ -307,37 +316,6 @@ template <typename CharT, CharT... C>
 constexpr CharT string_literal<CharT, C...>::value[sizeof...(C)];
 #endif
 
-template <typename Streambuf> class formatbuf : public Streambuf {
- private:
-  using char_type = typename Streambuf::char_type;
-  using streamsize = decltype(std::declval<Streambuf>().sputn(nullptr, 0));
-  using int_type = typename Streambuf::int_type;
-  using traits_type = typename Streambuf::traits_type;
-
-  buffer<char_type>& buffer_;
-
- public:
-  explicit formatbuf(buffer<char_type>& buf) : buffer_(buf) {}
-
- protected:
-  // The put area is always empty. This makes the implementation simpler and has
-  // the advantage that the streambuf and the buffer are always in sync and
-  // sputc never writes into uninitialized memory. A disadvantage is that each
-  // call to sputc always results in a (virtual) call to overflow. There is no
-  // disadvantage here for sputn since this always results in a call to xsputn.
-
-  auto overflow(int_type ch) -> int_type override {
-    if (!traits_type::eq_int_type(ch, traits_type::eof()))
-      buffer_.push_back(static_cast<char_type>(ch));
-    return ch;
-  }
-
-  auto xsputn(const char_type* s, streamsize count) -> streamsize override {
-    buffer_.append(s, s + count);
-    return count;
-  }
-};
-
 // Implementation of std::bit_cast for pre-C++20.
 template <typename To, typename From, FMT_ENABLE_IF(sizeof(To) == sizeof(From))>
 FMT_CONSTEXPR20 auto bit_cast(const From& from) -> To {
@@ -373,8 +351,8 @@ class uint128_fallback {
   constexpr uint128_fallback(uint64_t hi, uint64_t lo) : lo_(lo), hi_(hi) {}
   constexpr uint128_fallback(uint64_t value = 0) : lo_(value), hi_(0) {}
 
-  constexpr uint64_t high() const noexcept { return hi_; }
-  constexpr uint64_t low() const noexcept { return lo_; }
+  constexpr auto high() const noexcept -> uint64_t { return hi_; }
+  constexpr auto low() const noexcept -> uint64_t { return lo_; }
 
   template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
   constexpr explicit operator T() const {
@@ -450,7 +428,7 @@ class uint128_fallback {
     hi_ &= n.hi_;
   }
 
-  FMT_CONSTEXPR20 uint128_fallback& operator+=(uint64_t n) noexcept {
+  FMT_CONSTEXPR20 auto operator+=(uint64_t n) noexcept -> uint128_fallback& {
     if (is_constant_evaluated()) {
       lo_ += n;
       hi_ += (lo_ < n ? 1 : 0);
@@ -546,6 +524,52 @@ FMT_INLINE void assume(bool condition) {
 #endif
 }
 
+// Extracts a reference to the container from back_insert_iterator.
+template <typename Container>
+inline auto get_container(std::back_insert_iterator<Container> it)
+    -> Container& {
+  using base = std::back_insert_iterator<Container>;
+  struct accessor : base {
+    accessor(base b) : base(b) {}
+    using base::container;
+  };
+  return *accessor(it).container;
+}
+
+template <typename Char, typename InputIt, typename OutputIt>
+FMT_CONSTEXPR auto copy_str(InputIt begin, InputIt end, OutputIt out)
+    -> OutputIt {
+  while (begin != end) *out++ = static_cast<Char>(*begin++);
+  return out;
+}
+
+template <typename Char, typename T, typename U,
+          FMT_ENABLE_IF(
+              std::is_same<remove_const_t<T>, U>::value&& is_char<U>::value)>
+FMT_CONSTEXPR auto copy_str(T* begin, T* end, U* out) -> U* {
+  if (is_constant_evaluated()) return copy_str<Char, T*, U*>(begin, end, out);
+  auto size = to_unsigned(end - begin);
+  if (size > 0) memcpy(out, begin, size * sizeof(U));
+  return out + size;
+}
+
+template <typename Char, typename InputIt>
+auto copy_str(InputIt begin, InputIt end, appender out) -> appender {
+  get_container(out).append(begin, end);
+  return out;
+}
+template <typename Char, typename InputIt>
+auto copy_str(InputIt begin, InputIt end, back_insert_iterator<std::string> out)
+    -> back_insert_iterator<std::string> {
+  get_container(out).append(begin, end);
+  return out;
+}
+
+template <typename Char, typename R, typename OutputIt>
+FMT_CONSTEXPR auto copy_str(R&& rng, OutputIt out) -> OutputIt {
+  return detail::copy_str<Char>(rng.begin(), rng.end(), out);
+}
+
 // An approximation of iterator_t for pre-C++20 systems.
 template <typename T>
 using iterator_t = decltype(std::begin(std::declval<T&>()));
@@ -740,7 +764,7 @@ inline auto compute_width(basic_string_view<Char> s) -> size_t {
 }
 
 // Computes approximate display width of a UTF-8 string.
-FMT_CONSTEXPR inline size_t compute_width(string_view s) {
+FMT_CONSTEXPR inline auto compute_width(string_view s) -> size_t {
   size_t num_code_points = 0;
   // It is not a lambda for compatibility with C++14.
   struct count_code_points {
@@ -787,12 +811,17 @@ inline auto code_point_index(basic_string_view<Char> s, size_t n) -> size_t {
 
 // Calculates the index of the nth code point in a UTF-8 string.
 inline auto code_point_index(string_view s, size_t n) -> size_t {
-  const char* data = s.data();
-  size_t num_code_points = 0;
-  for (size_t i = 0, size = s.size(); i != size; ++i) {
-    if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) return i;
-  }
-  return s.size();
+  size_t result = s.size();
+  const char* begin = s.begin();
+  for_each_codepoint(s, [begin, &n, &result](uint32_t, string_view sv) {
+    if (n != 0) {
+      --n;
+      return true;
+    }
+    result = to_unsigned(sv.begin() - begin);
+    return false;
+  });
+  return result;
 }
 
 inline auto code_point_index(basic_string_view<char8_type> s, size_t n)
@@ -902,7 +931,7 @@ enum { inline_buffer_size = 500 };
   **Example**::
 
      auto out = fmt::memory_buffer();
-     format_to(std::back_inserter(out), "The answer is {}.", 42);
+     fmt::format_to(std::back_inserter(out), "The answer is {}.", 42);
 
   This will append the following output to the ``out`` object:
 
@@ -929,27 +958,29 @@ class basic_memory_buffer final : public detail::buffer<T> {
   }
 
  protected:
-  FMT_CONSTEXPR20 void grow(size_t size) override {
+  static FMT_CONSTEXPR20 void grow(detail::buffer<T>& buf, size_t size) {
     detail::abort_fuzzing_if(size > 5000);
-    const size_t max_size = std::allocator_traits<Allocator>::max_size(alloc_);
-    size_t old_capacity = this->capacity();
+    auto& self = static_cast<basic_memory_buffer&>(buf);
+    const size_t max_size =
+        std::allocator_traits<Allocator>::max_size(self.alloc_);
+    size_t old_capacity = buf.capacity();
     size_t new_capacity = old_capacity + old_capacity / 2;
     if (size > new_capacity)
       new_capacity = size;
     else if (new_capacity > max_size)
       new_capacity = size > max_size ? size : max_size;
-    T* old_data = this->data();
+    T* old_data = buf.data();
     T* new_data =
-        std::allocator_traits<Allocator>::allocate(alloc_, new_capacity);
+        std::allocator_traits<Allocator>::allocate(self.alloc_, new_capacity);
     // Suppress a bogus -Wstringop-overflow in gcc 13.1 (#3481).
-    detail::assume(this->size() <= new_capacity);
+    detail::assume(buf.size() <= new_capacity);
     // The following code doesn't throw, so the raw pointer above doesn't leak.
-    std::uninitialized_copy_n(old_data, this->size(), new_data);
-    this->set(new_data, new_capacity);
+    std::uninitialized_copy_n(old_data, buf.size(), new_data);
+    self.set(new_data, new_capacity);
     // deallocate must not throw according to the standard, but even if it does,
     // the buffer already uses the new storage and will deallocate it in
     // destructor.
-    if (old_data != store_) alloc_.deallocate(old_data, old_capacity);
+    if (old_data != self.store_) self.alloc_.deallocate(old_data, old_capacity);
   }
 
  public:
@@ -958,7 +989,7 @@ class basic_memory_buffer final : public detail::buffer<T> {
 
   FMT_CONSTEXPR20 explicit basic_memory_buffer(
       const Allocator& alloc = Allocator())
-      : alloc_(alloc) {
+      : detail::buffer<T>(grow), alloc_(alloc) {
     this->set(store_, SIZE);
     if (detail::is_constant_evaluated()) detail::fill_n(store_, SIZE, T());
   }
@@ -990,7 +1021,8 @@ class basic_memory_buffer final : public detail::buffer<T> {
     of the other object to it.
     \endrst
    */
-  FMT_CONSTEXPR20 basic_memory_buffer(basic_memory_buffer&& other) noexcept {
+  FMT_CONSTEXPR20 basic_memory_buffer(basic_memory_buffer&& other) noexcept
+      : detail::buffer<T>(grow) {
     move(other);
   }
 
@@ -1018,7 +1050,6 @@ class basic_memory_buffer final : public detail::buffer<T> {
   /** Increases the buffer capacity to *new_capacity*. */
   void reserve(size_t new_capacity) { this->try_reserve(new_capacity); }
 
-  // Directly append data into the buffer
   using detail::buffer<T>::append;
   template <typename ContiguousRange>
   void append(const ContiguousRange& range) {
@@ -1034,7 +1065,7 @@ struct is_contiguous<basic_memory_buffer<T, SIZE, Allocator>> : std::true_type {
 
 FMT_END_EXPORT
 namespace detail {
-FMT_API bool write_console(std::FILE* f, string_view text);
+FMT_API auto write_console(int fd, string_view text) -> bool;
 FMT_API void print(std::FILE*, string_view);
 }  // namespace detail
 
@@ -1046,7 +1077,7 @@ FMT_BEGIN_EXPORT
 #endif
 
 /** An error reported from a formatting function. */
-class FMT_VISIBILITY("default") format_error : public std::runtime_error {
+class FMT_SO_VISIBILITY("default") format_error : public std::runtime_error {
  public:
   using std::runtime_error::runtime_error;
 };
@@ -1089,7 +1120,7 @@ class loc_value {
   loc_value(T) {}
 
   template <typename Visitor> auto visit(Visitor&& vis) -> decltype(vis(0)) {
-    return visit_format_arg(vis, value_);
+    return value_.visit(vis);
   }
 };
 
@@ -1153,13 +1184,13 @@ using uint32_or_64_or_128_t =
 template <typename T>
 using uint64_or_128_t = conditional_t<num_bits<T>() <= 64, uint64_t, uint128_t>;
 
-#define FMT_POWERS_OF_10(factor)                                             \
-  factor * 10, (factor)*100, (factor)*1000, (factor)*10000, (factor)*100000, \
-      (factor)*1000000, (factor)*10000000, (factor)*100000000,               \
-      (factor)*1000000000
+#define FMT_POWERS_OF_10(factor)                                  \
+  factor * 10, (factor) * 100, (factor) * 1000, (factor) * 10000, \
+      (factor) * 100000, (factor) * 1000000, (factor) * 10000000, \
+      (factor) * 100000000, (factor) * 1000000000
 
 // Converts value in the range [0, 100) to a string.
-constexpr const char* digits2(size_t value) {
+constexpr auto digits2(size_t value) -> const char* {
   // GCC generates slightly better code when value is pointer-size.
   return &"0001020304050607080910111213141516171819"
          "2021222324252627282930313233343536373839"
@@ -1169,7 +1200,7 @@ constexpr const char* digits2(size_t value) {
 }
 
 // Sign is a template parameter to workaround a bug in gcc 4.8.
-template <typename Char, typename Sign> constexpr Char sign(Sign s) {
+template <typename Char, typename Sign> constexpr auto sign(Sign s) -> Char {
 #if !FMT_GCC_VERSION || FMT_GCC_VERSION >= 604
   static_assert(std::is_same<Sign, sign_t>::value, "");
 #endif
@@ -1394,7 +1425,7 @@ FMT_CONSTEXPR inline auto format_uint(It out, UInt value, int num_digits,
     return out;
   }
   // Buffer should be large enough to hold all digits (digits / BASE_BITS + 1).
-  char buffer[num_bits<UInt>() / BASE_BITS + 1];
+  char buffer[num_bits<UInt>() / BASE_BITS + 1] = {};
   format_uint<BASE_BITS>(buffer, value, num_digits, upper);
   return detail::copy_str_noinline<Char>(buffer, buffer + num_digits, out);
 }
@@ -1430,22 +1461,23 @@ template <typename WChar, typename Buffer = memory_buffer> class to_utf8 {
                                                       : "invalid utf32"));
   }
   operator string_view() const { return string_view(&buffer_[0], size()); }
-  size_t size() const { return buffer_.size() - 1; }
-  const char* c_str() const { return &buffer_[0]; }
-  std::string str() const { return std::string(&buffer_[0], size()); }
+  auto size() const -> size_t { return buffer_.size() - 1; }
+  auto c_str() const -> const char* { return &buffer_[0]; }
+  auto str() const -> std::string { return std::string(&buffer_[0], size()); }
 
   // Performs conversion returning a bool instead of throwing exception on
   // conversion error. This method may still throw in case of memory allocation
   // error.
-  bool convert(basic_string_view<WChar> s,
-               to_utf8_error_policy policy = to_utf8_error_policy::abort) {
+  auto convert(basic_string_view<WChar> s,
+               to_utf8_error_policy policy = to_utf8_error_policy::abort)
+      -> bool {
     if (!convert(buffer_, s, policy)) return false;
     buffer_.push_back(0);
     return true;
   }
-  static bool convert(
-      Buffer& buf, basic_string_view<WChar> s,
-      to_utf8_error_policy policy = to_utf8_error_policy::abort) {
+  static auto convert(Buffer& buf, basic_string_view<WChar> s,
+                      to_utf8_error_policy policy = to_utf8_error_policy::abort)
+      -> bool {
     for (auto p = s.begin(); p != s.end(); ++p) {
       uint32_t c = static_cast<uint32_t>(*p);
       if (sizeof(WChar) == 2 && c >= 0xd800 && c <= 0xdfff) {
@@ -1481,7 +1513,7 @@ template <typename WChar, typename Buffer = memory_buffer> class to_utf8 {
 };
 
 // Computes 128-bit result of multiplication of two 64-bit unsigned integers.
-inline uint128_fallback umul128(uint64_t x, uint64_t y) noexcept {
+inline auto umul128(uint64_t x, uint64_t y) noexcept -> uint128_fallback {
 #if FMT_USE_INT128
   auto p = static_cast<uint128_opt>(x) * static_cast<uint128_opt>(y);
   return {static_cast<uint64_t>(p >> 64), static_cast<uint64_t>(p)};
@@ -1512,19 +1544,19 @@ inline uint128_fallback umul128(uint64_t x, uint64_t y) noexcept {
 namespace dragonbox {
 // Computes floor(log10(pow(2, e))) for e in [-2620, 2620] using the method from
 // https://fmt.dev/papers/Dragonbox.pdf#page=28, section 6.1.
-inline int floor_log10_pow2(int e) noexcept {
+inline auto floor_log10_pow2(int e) noexcept -> int {
   FMT_ASSERT(e <= 2620 && e >= -2620, "too large exponent");
   static_assert((-1 >> 1) == -1, "right shift is not arithmetic");
   return (e * 315653) >> 20;
 }
 
-inline int floor_log2_pow10(int e) noexcept {
+inline auto floor_log2_pow10(int e) noexcept -> int {
   FMT_ASSERT(e <= 1233 && e >= -1233, "too large exponent");
   return (e * 1741647) >> 19;
 }
 
 // Computes upper 64 bits of multiplication of two 64-bit unsigned integers.
-inline uint64_t umul128_upper64(uint64_t x, uint64_t y) noexcept {
+inline auto umul128_upper64(uint64_t x, uint64_t y) noexcept -> uint64_t {
 #if FMT_USE_INT128
   auto p = static_cast<uint128_opt>(x) * static_cast<uint128_opt>(y);
   return static_cast<uint64_t>(p >> 64);
@@ -1537,14 +1569,14 @@ inline uint64_t umul128_upper64(uint64_t x, uint64_t y) noexcept {
 
 // Computes upper 128 bits of multiplication of a 64-bit unsigned integer and a
 // 128-bit unsigned integer.
-inline uint128_fallback umul192_upper128(uint64_t x,
-                                         uint128_fallback y) noexcept {
+inline auto umul192_upper128(uint64_t x, uint128_fallback y) noexcept
+    -> uint128_fallback {
   uint128_fallback r = umul128(x, y.high());
   r += umul128_upper64(x, y.low());
   return r;
 }
 
-FMT_API uint128_fallback get_cached_power(int k) noexcept;
+FMT_API auto get_cached_power(int k) noexcept -> uint128_fallback;
 
 // Type-specific information that Dragonbox uses.
 template <typename T, typename Enable = void> struct float_info;
@@ -1598,14 +1630,14 @@ template <typename T> FMT_API auto to_decimal(T x) noexcept -> decimal_fp<T>;
 }  // namespace dragonbox
 
 // Returns true iff Float has the implicit bit which is not stored.
-template <typename Float> constexpr bool has_implicit_bit() {
+template <typename Float> constexpr auto has_implicit_bit() -> bool {
   // An 80-bit FP number has a 64-bit significand an no implicit bit.
   return std::numeric_limits<Float>::digits != 64;
 }
 
 // Returns the number of significand bits stored in Float. The implicit bit is
 // not counted since it is not stored.
-template <typename Float> constexpr int num_significand_bits() {
+template <typename Float> constexpr auto num_significand_bits() -> int {
   // std::numeric_limits may not support __float128.
   return is_float128<Float>() ? 112
                               : (std::numeric_limits<Float>::digits -
@@ -1698,7 +1730,7 @@ using fp = basic_fp<unsigned long long>;
 
 // Normalizes the value converted from double and multiplied by (1 << SHIFT).
 template <int SHIFT = 0, typename F>
-FMT_CONSTEXPR basic_fp<F> normalize(basic_fp<F> value) {
+FMT_CONSTEXPR auto normalize(basic_fp<F> value) -> basic_fp<F> {
   // Handle subnormals.
   const auto implicit_bit = F(1) << num_significand_bits<double>();
   const auto shifted_implicit_bit = implicit_bit << SHIFT;
@@ -1715,7 +1747,7 @@ FMT_CONSTEXPR basic_fp<F> normalize(basic_fp<F> value) {
 }
 
 // Computes lhs * rhs / pow(2, 64) rounded to nearest with half-up tie breaking.
-FMT_CONSTEXPR inline uint64_t multiply(uint64_t lhs, uint64_t rhs) {
+FMT_CONSTEXPR inline auto multiply(uint64_t lhs, uint64_t rhs) -> uint64_t {
 #if FMT_USE_INT128
   auto product = static_cast<__uint128_t>(lhs) * rhs;
   auto f = static_cast<uint64_t>(product >> 64);
@@ -1732,33 +1764,10 @@ FMT_CONSTEXPR inline uint64_t multiply(uint64_t lhs, uint64_t rhs) {
 #endif
 }
 
-FMT_CONSTEXPR inline fp operator*(fp x, fp y) {
+FMT_CONSTEXPR inline auto operator*(fp x, fp y) -> fp {
   return {multiply(x.f, y.f), x.e + y.e + 64};
 }
 
-template <typename T = void> struct basic_data {
-  // For checking rounding thresholds.
-  // The kth entry is chosen to be the smallest integer such that the
-  // upper 32-bits of 10^(k+1) times it is strictly bigger than 5 * 10^k.
-  static constexpr uint32_t fractional_part_rounding_thresholds[8] = {
-      2576980378U,  // ceil(2^31 + 2^32/10^1)
-      2190433321U,  // ceil(2^31 + 2^32/10^2)
-      2151778616U,  // ceil(2^31 + 2^32/10^3)
-      2147913145U,  // ceil(2^31 + 2^32/10^4)
-      2147526598U,  // ceil(2^31 + 2^32/10^5)
-      2147487943U,  // ceil(2^31 + 2^32/10^6)
-      2147484078U,  // ceil(2^31 + 2^32/10^7)
-      2147483691U   // ceil(2^31 + 2^32/10^8)
-  };
-};
-// This is a struct rather than an alias to avoid shadowing warnings in gcc.
-struct data : basic_data<> {};
-
-#if FMT_CPLUSPLUS < 201703L
-template <typename T>
-constexpr uint32_t basic_data<T>::fractional_part_rounding_thresholds[];
-#endif
-
 template <typename T, bool doublish = num_bits<T>() == num_bits<double>()>
 using convert_float_result =
     conditional_t<std::is_same<T, float>::value || doublish, double, T>;
@@ -1939,15 +1948,11 @@ auto write_escaped_cp(OutputIt out, const find_escape_result<Char>& escape)
     *out++ = static_cast<Char>('\\');
     break;
   default:
-    if (escape.cp < 0x100) {
-      return write_codepoint<2, Char>(out, 'x', escape.cp);
-    }
-    if (escape.cp < 0x10000) {
+    if (escape.cp < 0x100) return write_codepoint<2, Char>(out, 'x', escape.cp);
+    if (escape.cp < 0x10000)
       return write_codepoint<4, Char>(out, 'u', escape.cp);
-    }
-    if (escape.cp < 0x110000) {
+    if (escape.cp < 0x110000)
       return write_codepoint<8, Char>(out, 'U', escape.cp);
-    }
     for (Char escape_char : basic_string_view<Char>(
              escape.begin, to_unsigned(escape.end - escape.begin))) {
       out = write_codepoint<2, Char>(out, 'x',
@@ -1977,11 +1982,13 @@ auto write_escaped_string(OutputIt out, basic_string_view<Char> str)
 
 template <typename Char, typename OutputIt>
 auto write_escaped_char(OutputIt out, Char v) -> OutputIt {
+  Char v_array[1] = {v};
   *out++ = static_cast<Char>('\'');
   if ((needs_escape(static_cast<uint32_t>(v)) && v != static_cast<Char>('"')) ||
       v == static_cast<Char>('\'')) {
-    out = write_escaped_cp(
-        out, find_escape_result<Char>{&v, &v + 1, static_cast<uint32_t>(v)});
+    out = write_escaped_cp(out,
+                           find_escape_result<Char>{v_array, v_array + 1,
+                                                    static_cast<uint32_t>(v)});
   } else {
     *out++ = v;
   }
@@ -2070,10 +2077,10 @@ template <typename Char> class digit_grouping {
     std::string::const_iterator group;
     int pos;
   };
-  next_state initial_state() const { return {grouping_.begin(), 0}; }
+  auto initial_state() const -> next_state { return {grouping_.begin(), 0}; }
 
   // Returns the next digit group separator position.
-  int next(next_state& state) const {
+  auto next(next_state& state) const -> int {
     if (thousands_sep_.empty()) return max_value<int>();
     if (state.group == grouping_.end()) return state.pos += grouping_.back();
     if (*state.group <= 0 || *state.group == max_value<char>())
@@ -2092,9 +2099,9 @@ template <typename Char> class digit_grouping {
   digit_grouping(std::string grouping, std::basic_string<Char> sep)
       : grouping_(std::move(grouping)), thousands_sep_(std::move(sep)) {}
 
-  bool has_separator() const { return !thousands_sep_.empty(); }
+  auto has_separator() const -> bool { return !thousands_sep_.empty(); }
 
-  int count_separators(int num_digits) const {
+  auto count_separators(int num_digits) const -> int {
     int count = 0;
     auto state = initial_state();
     while (num_digits > next(state)) ++count;
@@ -2103,7 +2110,7 @@ template <typename Char> class digit_grouping {
 
   // Applies grouping to digits and write the output to out.
   template <typename Out, typename C>
-  Out apply(Out out, basic_string_view<C> digits) const {
+  auto apply(Out out, basic_string_view<C> digits) const -> Out {
     auto num_digits = static_cast<int>(digits.size());
     auto separators = basic_memory_buffer<int>();
     separators.push_back(0);
@@ -2126,24 +2133,66 @@ template <typename Char> class digit_grouping {
   }
 };
 
+FMT_CONSTEXPR inline void prefix_append(unsigned& prefix, unsigned value) {
+  prefix |= prefix != 0 ? value << 8 : value;
+  prefix += (1u + (value > 0xff ? 1 : 0)) << 24;
+}
+
 // Writes a decimal integer with digit grouping.
 template <typename OutputIt, typename UInt, typename Char>
 auto write_int(OutputIt out, UInt value, unsigned prefix,
                const format_specs<Char>& specs,
                const digit_grouping<Char>& grouping) -> OutputIt {
   static_assert(std::is_same<uint64_or_128_t<UInt>, UInt>::value, "");
-  int num_digits = count_digits(value);
-  char digits[40];
-  format_decimal(digits, value, num_digits);
-  unsigned size = to_unsigned((prefix != 0 ? 1 : 0) + num_digits +
-                              grouping.count_separators(num_digits));
+  int num_digits = 0;
+  auto buffer = memory_buffer();
+  switch (specs.type) {
+  case presentation_type::none:
+  case presentation_type::dec: {
+    num_digits = count_digits(value);
+    format_decimal<char>(appender(buffer), value, num_digits);
+    break;
+  }
+  case presentation_type::hex_lower:
+  case presentation_type::hex_upper: {
+    bool upper = specs.type == presentation_type::hex_upper;
+    if (specs.alt)
+      prefix_append(prefix, unsigned(upper ? 'X' : 'x') << 8 | '0');
+    num_digits = count_digits<4>(value);
+    format_uint<4, char>(appender(buffer), value, num_digits, upper);
+    break;
+  }
+  case presentation_type::bin_lower:
+  case presentation_type::bin_upper: {
+    bool upper = specs.type == presentation_type::bin_upper;
+    if (specs.alt)
+      prefix_append(prefix, unsigned(upper ? 'B' : 'b') << 8 | '0');
+    num_digits = count_digits<1>(value);
+    format_uint<1, char>(appender(buffer), value, num_digits);
+    break;
+  }
+  case presentation_type::oct: {
+    num_digits = count_digits<3>(value);
+    // Octal prefix '0' is counted as a digit, so only add it if precision
+    // is not greater than the number of digits.
+    if (specs.alt && specs.precision <= num_digits && value != 0)
+      prefix_append(prefix, '0');
+    format_uint<3, char>(appender(buffer), value, num_digits);
+    break;
+  }
+  case presentation_type::chr:
+    return write_char(out, static_cast<Char>(value), specs);
+  default:
+    throw_format_error("invalid format specifier");
+  }
+
+  unsigned size = (prefix != 0 ? prefix >> 24 : 0) + to_unsigned(num_digits) +
+                  to_unsigned(grouping.count_separators(num_digits));
   return write_padded<align::right>(
       out, specs, size, size, [&](reserve_iterator<OutputIt> it) {
-        if (prefix != 0) {
-          char sign = static_cast<char>(prefix);
-          *it++ = static_cast<Char>(sign);
-        }
-        return grouping.apply(it, string_view(digits, to_unsigned(num_digits)));
+        for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
+          *it++ = static_cast<Char>(p & 0xff);
+        return grouping.apply(it, string_view(buffer.data(), buffer.size()));
       });
 }
 
@@ -2156,11 +2205,6 @@ inline auto write_loc(OutputIt, loc_value, const format_specs<Char>&,
   return false;
 }
 
-FMT_CONSTEXPR inline void prefix_append(unsigned& prefix, unsigned value) {
-  prefix |= prefix != 0 ? value << 8 : value;
-  prefix += (1u + (value > 0xff ? 1 : 0)) << 24;
-}
-
 template <typename UInt> struct write_int_arg {
   UInt abs_value;
   unsigned prefix;
@@ -2307,25 +2351,25 @@ class counting_iterator {
 
   FMT_CONSTEXPR counting_iterator() : count_(0) {}
 
-  FMT_CONSTEXPR size_t count() const { return count_; }
+  FMT_CONSTEXPR auto count() const -> size_t { return count_; }
 
-  FMT_CONSTEXPR counting_iterator& operator++() {
+  FMT_CONSTEXPR auto operator++() -> counting_iterator& {
     ++count_;
     return *this;
   }
-  FMT_CONSTEXPR counting_iterator operator++(int) {
+  FMT_CONSTEXPR auto operator++(int) -> counting_iterator {
     auto it = *this;
     ++*this;
     return it;
   }
 
-  FMT_CONSTEXPR friend counting_iterator operator+(counting_iterator it,
-                                                   difference_type n) {
+  FMT_CONSTEXPR friend auto operator+(counting_iterator it, difference_type n)
+      -> counting_iterator {
     it.count_ += static_cast<size_t>(n);
     return it;
   }
 
-  FMT_CONSTEXPR value_type operator*() const { return {}; }
+  FMT_CONSTEXPR auto operator*() const -> value_type { return {}; }
 };
 
 template <typename Char, typename OutputIt>
@@ -2360,9 +2404,10 @@ template <typename Char, typename OutputIt>
 FMT_CONSTEXPR auto write(OutputIt out, const Char* s,
                          const format_specs<Char>& specs, locale_ref)
     -> OutputIt {
-  return specs.type != presentation_type::pointer
-             ? write(out, basic_string_view<Char>(s), specs, {})
-             : write_ptr<Char>(out, bit_cast<uintptr_t>(s), &specs);
+  if (specs.type == presentation_type::pointer)
+    return write_ptr<Char>(out, bit_cast<uintptr_t>(s), &specs);
+  if (!s) throw_format_error("string pointer is null");
+  return write(out, basic_string_view<Char>(s), specs, {});
 }
 
 template <typename Char, typename OutputIt, typename T,
@@ -2448,9 +2493,8 @@ struct float_specs {
   bool showpoint : 1;
 };
 
-template <typename ErrorHandler = error_handler, typename Char>
-FMT_CONSTEXPR auto parse_float_type_spec(const format_specs<Char>& specs,
-                                         ErrorHandler&& eh = {})
+template <typename Char>
+FMT_CONSTEXPR auto parse_float_type_spec(const format_specs<Char>& specs)
     -> float_specs {
   auto result = float_specs();
   result.showpoint = specs.alt;
@@ -2486,7 +2530,7 @@ FMT_CONSTEXPR auto parse_float_type_spec(const format_specs<Char>& specs,
     result.format = float_format::hex;
     break;
   default:
-    eh.on_error("invalid format specifier");
+    throw_format_error("invalid format specifier");
     break;
   }
   return result;
@@ -2725,12 +2769,12 @@ template <typename Char> class fallback_digit_grouping {
  public:
   constexpr fallback_digit_grouping(locale_ref, bool) {}
 
-  constexpr bool has_separator() const { return false; }
+  constexpr auto has_separator() const -> bool { return false; }
 
-  constexpr int count_separators(int) const { return 0; }
+  constexpr auto count_separators(int) const -> int { return 0; }
 
   template <typename Out, typename C>
-  constexpr Out apply(Out out, basic_string_view<C>) const {
+  constexpr auto apply(Out out, basic_string_view<C>) const -> Out {
     return out;
   }
 };
@@ -2749,7 +2793,7 @@ FMT_CONSTEXPR20 auto write_float(OutputIt out, const DecimalFP& f,
   }
 }
 
-template <typename T> constexpr bool isnan(T value) {
+template <typename T> constexpr auto isnan(T value) -> bool {
   return !(value >= value);  // std::isnan doesn't support __float128.
 }
 
@@ -2762,14 +2806,14 @@ struct has_isfinite<T, enable_if_t<sizeof(std::isfinite(T())) != 0>>
 
 template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value&&
                                         has_isfinite<T>::value)>
-FMT_CONSTEXPR20 bool isfinite(T value) {
+FMT_CONSTEXPR20 auto isfinite(T value) -> bool {
   constexpr T inf = T(std::numeric_limits<double>::infinity());
   if (is_constant_evaluated())
     return !detail::isnan(value) && value < inf && value > -inf;
   return std::isfinite(value);
 }
 template <typename T, FMT_ENABLE_IF(!has_isfinite<T>::value)>
-FMT_CONSTEXPR bool isfinite(T value) {
+FMT_CONSTEXPR auto isfinite(T value) -> bool {
   T inf = T(std::numeric_limits<double>::infinity());
   // std::isfinite doesn't support __float128.
   return !detail::isnan(value) && value < inf && value > -inf;
@@ -2806,10 +2850,10 @@ class bigint {
   basic_memory_buffer<bigit, bigits_capacity> bigits_;
   int exp_;
 
-  FMT_CONSTEXPR20 bigit operator[](int index) const {
+  FMT_CONSTEXPR20 auto operator[](int index) const -> bigit {
     return bigits_[to_unsigned(index)];
   }
-  FMT_CONSTEXPR20 bigit& operator[](int index) {
+  FMT_CONSTEXPR20 auto operator[](int index) -> bigit& {
     return bigits_[to_unsigned(index)];
   }
 
@@ -2905,11 +2949,11 @@ class bigint {
     assign(uint64_or_128_t<Int>(n));
   }
 
-  FMT_CONSTEXPR20 int num_bigits() const {
+  FMT_CONSTEXPR20 auto num_bigits() const -> int {
     return static_cast<int>(bigits_.size()) + exp_;
   }
 
-  FMT_NOINLINE FMT_CONSTEXPR20 bigint& operator<<=(int shift) {
+  FMT_NOINLINE FMT_CONSTEXPR20 auto operator<<=(int shift) -> bigint& {
     FMT_ASSERT(shift >= 0, "");
     exp_ += shift / bigit_bits;
     shift %= bigit_bits;
@@ -2924,13 +2968,15 @@ class bigint {
     return *this;
   }
 
-  template <typename Int> FMT_CONSTEXPR20 bigint& operator*=(Int value) {
+  template <typename Int>
+  FMT_CONSTEXPR20 auto operator*=(Int value) -> bigint& {
     FMT_ASSERT(value > 0, "");
     multiply(uint32_or_64_or_128_t<Int>(value));
     return *this;
   }
 
-  friend FMT_CONSTEXPR20 int compare(const bigint& lhs, const bigint& rhs) {
+  friend FMT_CONSTEXPR20 auto compare(const bigint& lhs, const bigint& rhs)
+      -> int {
     int num_lhs_bigits = lhs.num_bigits(), num_rhs_bigits = rhs.num_bigits();
     if (num_lhs_bigits != num_rhs_bigits)
       return num_lhs_bigits > num_rhs_bigits ? 1 : -1;
@@ -2947,8 +2993,9 @@ class bigint {
   }
 
   // Returns compare(lhs1 + lhs2, rhs).
-  friend FMT_CONSTEXPR20 int add_compare(const bigint& lhs1, const bigint& lhs2,
-                                         const bigint& rhs) {
+  friend FMT_CONSTEXPR20 auto add_compare(const bigint& lhs1,
+                                          const bigint& lhs2, const bigint& rhs)
+      -> int {
     auto minimum = [](int a, int b) { return a < b ? a : b; };
     auto maximum = [](int a, int b) { return a > b ? a : b; };
     int max_lhs_bigits = maximum(lhs1.num_bigits(), lhs2.num_bigits());
@@ -3029,13 +3076,13 @@ class bigint {
     bigits_.resize(to_unsigned(num_bigits + exp_difference));
     for (int i = num_bigits - 1, j = i + exp_difference; i >= 0; --i, --j)
       bigits_[j] = bigits_[i];
-    std::uninitialized_fill_n(bigits_.data(), exp_difference, 0);
+    std::uninitialized_fill_n(bigits_.data(), exp_difference, 0u);
     exp_ -= exp_difference;
   }
 
   // Divides this bignum by divisor, assigning the remainder to this and
   // returning the quotient.
-  FMT_CONSTEXPR20 int divmod_assign(const bigint& divisor) {
+  FMT_CONSTEXPR20 auto divmod_assign(const bigint& divisor) -> int {
     FMT_ASSERT(this != &divisor, "");
     if (compare(*this, divisor) < 0) return 0;
     FMT_ASSERT(divisor.bigits_[divisor.bigits_.size() - 1u] != 0, "");
@@ -3178,8 +3225,10 @@ FMT_CONSTEXPR20 inline void format_dragon(basic_fp<uint128_t> value,
       }
       if (buf[0] == overflow) {
         buf[0] = '1';
-        if ((flags & dragon::fixed) != 0) buf.push_back('0');
-        else ++exp10;
+        if ((flags & dragon::fixed) != 0)
+          buf.push_back('0');
+        else
+          ++exp10;
       }
       return;
     }
@@ -3276,6 +3325,17 @@ FMT_CONSTEXPR20 void format_hexfloat(Float value, int precision,
   format_hexfloat(static_cast<double>(value), precision, specs, buf);
 }
 
+constexpr auto fractional_part_rounding_thresholds(int index) -> uint32_t {
+  // For checking rounding thresholds.
+  // The kth entry is chosen to be the smallest integer such that the
+  // upper 32-bits of 10^(k+1) times it is strictly bigger than 5 * 10^k.
+  // It is equal to ceil(2^31 + 2^32/10^(k + 1)).
+  // These are stored in a string literal because we cannot have static arrays
+  // in constexpr functions and non-static ones are poorly optimized.
+  return U"\x9999999a\x828f5c29\x80418938\x80068db9\x8000a7c6\x800010c7"
+         U"\x800001ae\x8000002b"[index];
+}
+
 template <typename Float>
 FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs,
                                   buffer<char>& buf) -> int {
@@ -3480,12 +3540,12 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs,
           //    fractional part is strictly larger than 1/2.
           if (precision < 9) {
             uint32_t fractional_part = static_cast<uint32_t>(prod);
-            should_round_up = fractional_part >=
-                                  data::fractional_part_rounding_thresholds
-                                      [8 - number_of_digits_to_print] ||
-                              ((fractional_part >> 31) &
-                               ((digits & 1) | (second_third_subsegments != 0) |
-                                has_more_segments)) != 0;
+            should_round_up =
+                fractional_part >= fractional_part_rounding_thresholds(
+                                       8 - number_of_digits_to_print) ||
+                ((fractional_part >> 31) &
+                 ((digits & 1) | (second_third_subsegments != 0) |
+                  has_more_segments)) != 0;
           }
           // Rounding at the subsegment boundary.
           // In this case, the fractional part is at least 1/2 if and only if
@@ -3520,12 +3580,12 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs,
             // of 19 digits, so in this case the third segment should be
             // consisting of a genuine digit from the input.
             uint32_t fractional_part = static_cast<uint32_t>(prod);
-            should_round_up = fractional_part >=
-                                  data::fractional_part_rounding_thresholds
-                                      [8 - number_of_digits_to_print] ||
-                              ((fractional_part >> 31) &
-                               ((digits & 1) | (third_subsegment != 0) |
-                                has_more_segments)) != 0;
+            should_round_up =
+                fractional_part >= fractional_part_rounding_thresholds(
+                                       8 - number_of_digits_to_print) ||
+                ((fractional_part >> 31) &
+                 ((digits & 1) | (third_subsegment != 0) |
+                  has_more_segments)) != 0;
           }
           // Rounding at the subsegment boundary.
           else {
@@ -3726,8 +3786,7 @@ FMT_CONSTEXPR auto write(OutputIt out, Char value) -> OutputIt {
 }
 
 template <typename Char, typename OutputIt>
-FMT_CONSTEXPR_CHAR_TRAITS auto write(OutputIt out, const Char* value)
-    -> OutputIt {
+FMT_CONSTEXPR20 auto write(OutputIt out, const Char* value) -> OutputIt {
   if (value) return write(out, basic_string_view<Char>(value));
   throw_format_error("string pointer is null");
   return out;
@@ -3757,8 +3816,11 @@ template <typename Char, typename OutputIt, typename T,
 FMT_CONSTEXPR auto write(OutputIt out, const T& value)
     -> enable_if_t<mapped_type_constant<T, Context>::value == type::custom_type,
                    OutputIt> {
+  auto formatter = typename Context::template formatter_type<T>();
+  auto parse_ctx = typename Context::parse_context_type({});
+  formatter.parse(parse_ctx);
   auto ctx = Context(out, {}, {});
-  return typename Context::template formatter_type<T>().format(value, ctx);
+  return formatter.format(value, ctx);
 }
 
 // An argument visitor that formats the argument and writes it via the output
@@ -3801,62 +3863,39 @@ template <typename Char> struct arg_formatter {
   }
 };
 
-template <typename Char> struct custom_formatter {
-  basic_format_parse_context<Char>& parse_ctx;
-  buffer_context<Char>& ctx;
-
-  void operator()(
-      typename basic_format_arg<buffer_context<Char>>::handle h) const {
-    h.format(parse_ctx, ctx);
-  }
-  template <typename T> void operator()(T) const {}
-};
-
-template <typename ErrorHandler> class width_checker {
- public:
-  explicit FMT_CONSTEXPR width_checker(ErrorHandler& eh) : handler_(eh) {}
-
+struct width_checker {
   template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
   FMT_CONSTEXPR auto operator()(T value) -> unsigned long long {
-    if (is_negative(value)) handler_.on_error("negative width");
+    if (is_negative(value)) throw_format_error("negative width");
     return static_cast<unsigned long long>(value);
   }
 
   template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
   FMT_CONSTEXPR auto operator()(T) -> unsigned long long {
-    handler_.on_error("width is not integer");
+    throw_format_error("width is not integer");
     return 0;
   }
-
- private:
-  ErrorHandler& handler_;
 };
 
-template <typename ErrorHandler> class precision_checker {
- public:
-  explicit FMT_CONSTEXPR precision_checker(ErrorHandler& eh) : handler_(eh) {}
-
+struct precision_checker {
   template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
   FMT_CONSTEXPR auto operator()(T value) -> unsigned long long {
-    if (is_negative(value)) handler_.on_error("negative precision");
+    if (is_negative(value)) throw_format_error("negative precision");
     return static_cast<unsigned long long>(value);
   }
 
   template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
   FMT_CONSTEXPR auto operator()(T) -> unsigned long long {
-    handler_.on_error("precision is not integer");
+    throw_format_error("precision is not integer");
     return 0;
   }
-
- private:
-  ErrorHandler& handler_;
 };
 
-template <template <typename> class Handler, typename FormatArg,
-          typename ErrorHandler>
-FMT_CONSTEXPR auto get_dynamic_spec(FormatArg arg, ErrorHandler eh) -> int {
-  unsigned long long value = visit_format_arg(Handler<ErrorHandler>(eh), arg);
-  if (value > to_unsigned(max_value<int>())) eh.on_error("number is too big");
+template <typename Handler, typename FormatArg>
+FMT_CONSTEXPR auto get_dynamic_spec(FormatArg arg) -> int {
+  unsigned long long value = arg.visit(Handler());
+  if (value > to_unsigned(max_value<int>()))
+    throw_format_error("number is too big");
   return static_cast<int>(value);
 }
 
@@ -3867,7 +3906,7 @@ FMT_CONSTEXPR auto get_arg(Context& ctx, ID id) -> decltype(ctx.arg(id)) {
   return arg;
 }
 
-template <template <typename> class Handler, typename Context>
+template <typename Handler, typename Context>
 FMT_CONSTEXPR void handle_dynamic_spec(int& value,
                                        arg_ref<typename Context::char_type> ref,
                                        Context& ctx) {
@@ -3875,12 +3914,10 @@ FMT_CONSTEXPR void handle_dynamic_spec(int& value,
   case arg_id_kind::none:
     break;
   case arg_id_kind::index:
-    value = detail::get_dynamic_spec<Handler>(get_arg(ctx, ref.val.index),
-                                              ctx.error_handler());
+    value = detail::get_dynamic_spec<Handler>(get_arg(ctx, ref.val.index));
     break;
   case arg_id_kind::name:
-    value = detail::get_dynamic_spec<Handler>(get_arg(ctx, ref.val.name),
-                                              ctx.error_handler());
+    value = detail::get_dynamic_spec<Handler>(get_arg(ctx, ref.val.name));
     break;
   }
 }
@@ -4052,12 +4089,10 @@ class format_int {
 
 template <typename T, typename Char>
 struct formatter<T, Char, enable_if_t<detail::has_format_as<T>::value>>
-    : private formatter<detail::format_as_t<T>, Char> {
-  using base = formatter<detail::format_as_t<T>, Char>;
-  using base::parse;
-
+    : formatter<detail::format_as_t<T>, Char> {
   template <typename FormatContext>
   auto format(const T& value, FormatContext& ctx) const -> decltype(ctx.out()) {
+    using base = formatter<detail::format_as_t<T>, Char>;
     return base::format(format_as(value), ctx);
   }
 };
@@ -4198,84 +4233,59 @@ template <typename T> struct formatter<group_digits_view<T>> : formatter<T> {
   }
 };
 
-// DEPRECATED! join_view will be moved to ranges.h.
-template <typename It, typename Sentinel, typename Char = char>
-struct join_view : detail::view {
-  It begin;
-  Sentinel end;
-  basic_string_view<Char> sep;
-
-  join_view(It b, Sentinel e, basic_string_view<Char> s)
-      : begin(b), end(e), sep(s) {}
+template <typename T> struct nested_view {
+  const formatter<T>* fmt;
+  const T* value;
 };
 
-template <typename It, typename Sentinel, typename Char>
-struct formatter<join_view<It, Sentinel, Char>, Char> {
+template <typename T> struct formatter<nested_view<T>> {
+  FMT_CONSTEXPR auto parse(format_parse_context& ctx) -> const char* {
+    return ctx.begin();
+  }
+  auto format(nested_view<T> view, format_context& ctx) const
+      -> decltype(ctx.out()) {
+    return view.fmt->format(*view.value, ctx);
+  }
+};
+
+template <typename T> struct nested_formatter {
  private:
-  using value_type =
-#ifdef __cpp_lib_ranges
-      std::iter_value_t<It>;
-#else
-      typename std::iterator_traits<It>::value_type;
-#endif
-  formatter<remove_cvref_t<value_type>, Char> value_formatter_;
+  int width_;
+  detail::fill_t<char> fill_;
+  align_t align_ : 4;
+  formatter<T> formatter_;
 
  public:
-  template <typename ParseContext>
-  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const Char* {
-    return value_formatter_.parse(ctx);
+  constexpr nested_formatter() : width_(0), align_(align_t::none) {}
+
+  FMT_CONSTEXPR auto parse(format_parse_context& ctx) -> const char* {
+    auto specs = detail::dynamic_format_specs<char>();
+    auto it = parse_format_specs(ctx.begin(), ctx.end(), specs, ctx,
+                                 detail::type::none_type);
+    width_ = specs.width;
+    fill_ = specs.fill;
+    align_ = specs.align;
+    ctx.advance_to(it);
+    return formatter_.parse(ctx);
   }
 
-  template <typename FormatContext>
-  auto format(const join_view<It, Sentinel, Char>& value,
-              FormatContext& ctx) const -> decltype(ctx.out()) {
-    auto it = value.begin;
-    auto out = ctx.out();
-    if (it != value.end) {
-      out = value_formatter_.format(*it, ctx);
-      ++it;
-      while (it != value.end) {
-        out = detail::copy_str<Char>(value.sep.begin(), value.sep.end(), out);
-        ctx.advance_to(out);
-        out = value_formatter_.format(*it, ctx);
-        ++it;
-      }
-    }
-    return out;
+  template <typename F>
+  auto write_padded(format_context& ctx, F write) const -> decltype(ctx.out()) {
+    if (width_ == 0) return write(ctx.out());
+    auto buf = memory_buffer();
+    write(std::back_inserter(buf));
+    auto specs = format_specs<>();
+    specs.width = width_;
+    specs.fill = fill_;
+    specs.align = align_;
+    return detail::write(ctx.out(), string_view(buf.data(), buf.size()), specs);
+  }
+
+  auto nested(const T& value) const -> nested_view<T> {
+    return nested_view<T>{&formatter_, &value};
   }
 };
 
-/**
-  Returns a view that formats the iterator range `[begin, end)` with elements
-  separated by `sep`.
- */
-template <typename It, typename Sentinel>
-auto join(It begin, Sentinel end, string_view sep) -> join_view<It, Sentinel> {
-  return {begin, end, sep};
-}
-
-/**
-  \rst
-  Returns a view that formats `range` with elements separated by `sep`.
-
-  **Example**::
-
-    std::vector<int> v = {1, 2, 3};
-    fmt::print("{}", fmt::join(v, ", "));
-    // Output: "1, 2, 3"
-
-  ``fmt::join`` applies passed format specifiers to the range elements::
-
-    fmt::print("{:02}", fmt::join(v, ", "));
-    // Output: "01, 02, 03"
-  \endrst
- */
-template <typename Range>
-auto join(Range&& range, string_view sep)
-    -> join_view<detail::iterator_t<Range>, detail::sentinel_t<Range>> {
-  return join(std::begin(range), std::end(range), sep);
-}
-
 /**
   \rst
   Converts *value* to ``std::string`` using the default format for type *T*.
@@ -4329,12 +4339,12 @@ void vformat_to(buffer<Char>& buf, basic_string_view<Char> fmt,
   auto out = buffer_appender<Char>(buf);
   if (fmt.size() == 2 && equal2(fmt.data(), "{}")) {
     auto arg = args.get(0);
-    if (!arg) error_handler().on_error("argument not found");
-    visit_format_arg(default_arg_formatter<Char>{out, args, loc}, arg);
+    if (!arg) throw_format_error("argument not found");
+    arg.visit(default_arg_formatter<Char>{out, args, loc});
     return;
   }
 
-  struct format_handler : error_handler {
+  struct format_handler {
     basic_format_parse_context<Char> parse_context;
     buffer_context<Char> context;
 
@@ -4356,26 +4366,22 @@ void vformat_to(buffer<Char>& buf, basic_string_view<Char> fmt,
     }
     FMT_CONSTEXPR auto on_arg_id(basic_string_view<Char> id) -> int {
       int arg_id = context.arg_id(id);
-      if (arg_id < 0) on_error("argument not found");
+      if (arg_id < 0) throw_format_error("argument not found");
       return arg_id;
     }
 
     FMT_INLINE void on_replacement_field(int id, const Char*) {
       auto arg = get_arg(context, id);
-      context.advance_to(visit_format_arg(
-          default_arg_formatter<Char>{context.out(), context.args(),
-                                      context.locale()},
-          arg));
+      context.advance_to(arg.visit(default_arg_formatter<Char>{
+          context.out(), context.args(), context.locale()}));
     }
 
     auto on_format_specs(int id, const Char* begin, const Char* end)
         -> const Char* {
       auto arg = get_arg(context, id);
-      if (arg.type() == type::custom_type) {
-        parse_context.advance_to(begin);
-        visit_format_arg(custom_formatter<Char>{parse_context, context}, arg);
+      // Not using a visitor for custom types gives better codegen.
+      if (arg.format_custom(begin, parse_context, context))
         return parse_context.begin();
-      }
       auto specs = detail::dynamic_format_specs<Char>();
       begin = parse_format_specs(begin, end, specs, parse_context, arg.type());
       detail::handle_dynamic_spec<detail::width_checker>(
@@ -4383,11 +4389,13 @@ void vformat_to(buffer<Char>& buf, basic_string_view<Char> fmt,
       detail::handle_dynamic_spec<detail::precision_checker>(
           specs.precision, specs.precision_ref, context);
       if (begin == end || *begin != '}')
-        on_error("missing '}' in format string");
-      auto f = arg_formatter<Char>{context.out(), specs, context.locale()};
-      context.advance_to(visit_format_arg(f, arg));
+        throw_format_error("missing '}' in format string");
+      context.advance_to(arg.visit(
+          arg_formatter<Char>{context.out(), specs, context.locale()}));
       return begin;
     }
+
+    void on_error(const char* message) { throw_format_error(message); }
   };
   detail::parse_format_string<false>(fmt, format_handler(out, fmt, args, loc));
 }
@@ -4426,7 +4434,7 @@ template <detail_exported::fixed_string Str> constexpr auto operator""_a() {
   return detail::udl_arg<char_t, sizeof(Str.data) / sizeof(char_t), Str>();
 }
 #  else
-constexpr auto operator"" _a(const char* s, size_t) -> detail::udl_arg<char> {
+constexpr auto operator""_a(const char* s, size_t) -> detail::udl_arg<char> {
   return {s};
 }
 #  endif
@@ -4486,16 +4494,16 @@ formatter<T, Char,
                       detail::type::custom_type>>::format(const T& val,
                                                           FormatContext& ctx)
     const -> decltype(ctx.out()) {
-  if (specs_.width_ref.kind != detail::arg_id_kind::none ||
-      specs_.precision_ref.kind != detail::arg_id_kind::none) {
-    auto specs = specs_;
-    detail::handle_dynamic_spec<detail::width_checker>(specs.width,
-                                                       specs.width_ref, ctx);
-    detail::handle_dynamic_spec<detail::precision_checker>(
-        specs.precision, specs.precision_ref, ctx);
-    return detail::write<Char>(ctx.out(), val, specs, ctx.locale());
+  if (specs_.width_ref.kind == detail::arg_id_kind::none &&
+      specs_.precision_ref.kind == detail::arg_id_kind::none) {
+    return detail::write<Char>(ctx.out(), val, specs_, ctx.locale());
   }
-  return detail::write<Char>(ctx.out(), val, specs_, ctx.locale());
+  auto specs = specs_;
+  detail::handle_dynamic_spec<detail::width_checker>(specs.width,
+                                                     specs.width_ref, ctx);
+  detail::handle_dynamic_spec<detail::precision_checker>(
+      specs.precision, specs.precision_ref, ctx);
+  return detail::write<Char>(ctx.out(), val, specs, ctx.locale());
 }
 
 FMT_END_NAMESPACE
diff --git a/src/fmt/os.h b/src/fmt/os.h
index 2126424d39..6009ccc112 100644
--- a/src/fmt/os.h
+++ b/src/fmt/os.h
@@ -13,12 +13,14 @@
 #include <cstdio>
 #include <system_error>  // std::system_error
 
-#if defined __APPLE__ || defined(__FreeBSD__)
-#  include <xlocale.h>  // for LC_NUMERIC_MASK on OS X
-#endif
-
 #include "format.h"
 
+#if defined __APPLE__ || defined(__FreeBSD__)
+#  if FMT_HAS_INCLUDE(<xlocale.h>)
+#    include <xlocale.h>  // for LC_NUMERIC_MASK on OS X
+#  endif
+#endif
+
 #ifndef FMT_USE_FCNTL
 // UWP doesn't provide _pipe.
 #  if FMT_HAS_INCLUDE("winapifamily.h")
@@ -46,6 +48,7 @@
 
 // Calls to system functions are wrapped in FMT_SYSTEM for testability.
 #ifdef FMT_SYSTEM
+#  define FMT_HAS_SYSTEM
 #  define FMT_POSIX_CALL(call) FMT_SYSTEM(call)
 #else
 #  define FMT_SYSTEM(call) ::call
@@ -114,7 +117,7 @@ template <typename Char> class basic_cstring_view {
   basic_cstring_view(const std::basic_string<Char>& s) : data_(s.c_str()) {}
 
   /** Returns the pointer to a C string. */
-  const Char* c_str() const { return data_; }
+  auto c_str() const -> const Char* { return data_; }
 };
 
 using cstring_view = basic_cstring_view<char>;
@@ -169,7 +172,7 @@ std::system_error windows_error(int error_code, string_view message,
 // Can be used to report errors from destructors.
 FMT_API void report_windows_error(int error_code, const char* message) noexcept;
 #else
-inline const std::error_category& system_category() noexcept {
+inline auto system_category() noexcept -> const std::error_category& {
   return std::system_category();
 }
 #endif  // _WIN32
@@ -206,7 +209,7 @@ class buffered_file {
     other.file_ = nullptr;
   }
 
-  buffered_file& operator=(buffered_file&& other) {
+  auto operator=(buffered_file&& other) -> buffered_file& {
     close();
     file_ = other.file_;
     other.file_ = nullptr;
@@ -220,9 +223,9 @@ class buffered_file {
   FMT_API void close();
 
   // Returns the pointer to a FILE object representing this file.
-  FILE* get() const noexcept { return file_; }
+  auto get() const noexcept -> FILE* { return file_; }
 
-  FMT_API int descriptor() const;
+  FMT_API auto descriptor() const -> int;
 
   void vprint(string_view format_str, format_args args) {
     fmt::vprint(file_, format_str, args);
@@ -235,6 +238,7 @@ class buffered_file {
 };
 
 #if FMT_USE_FCNTL
+
 // A file. Closed file is represented by a file object with descriptor -1.
 // Methods that are not declared with noexcept may throw
 // fmt::system_error in case of failure. Note that some errors such as
@@ -248,6 +252,8 @@ class FMT_API file {
   // Constructs a file object with a given descriptor.
   explicit file(int fd) : fd_(fd) {}
 
+  friend struct pipe;
+
  public:
   // Possible values for the oflag argument to the constructor.
   enum {
@@ -272,7 +278,7 @@ class FMT_API file {
   file(file&& other) noexcept : fd_(other.fd_) { other.fd_ = -1; }
 
   // Move assignment is not noexcept because close may throw.
-  file& operator=(file&& other) {
+  auto operator=(file&& other) -> file& {
     close();
     fd_ = other.fd_;
     other.fd_ = -1;
@@ -283,24 +289,24 @@ class FMT_API file {
   ~file() noexcept;
 
   // Returns the file descriptor.
-  int descriptor() const noexcept { return fd_; }
+  auto descriptor() const noexcept -> int { return fd_; }
 
   // Closes the file.
   void close();
 
   // Returns the file size. The size has signed type for consistency with
   // stat::st_size.
-  long long size() const;
+  auto size() const -> long long;
 
   // Attempts to read count bytes from the file into the specified buffer.
-  size_t read(void* buffer, size_t count);
+  auto read(void* buffer, size_t count) -> size_t;
 
   // Attempts to write count bytes from the specified buffer to the file.
-  size_t write(const void* buffer, size_t count);
+  auto write(const void* buffer, size_t count) -> size_t;
 
   // Duplicates a file descriptor with the dup function and returns
   // the duplicate as a file object.
-  static file dup(int fd);
+  static auto dup(int fd) -> file;
 
   // Makes fd be the copy of this file descriptor, closing fd first if
   // necessary.
@@ -310,13 +316,9 @@ class FMT_API file {
   // necessary.
   void dup2(int fd, std::error_code& ec) noexcept;
 
-  // Creates a pipe setting up read_end and write_end file objects for reading
-  // and writing respectively.
-  static void pipe(file& read_end, file& write_end);
-
   // Creates a buffered_file object associated with this file and detaches
   // this file object from the file.
-  buffered_file fdopen(const char* mode);
+  auto fdopen(const char* mode) -> buffered_file;
 
 #  if defined(_WIN32) && !defined(__MINGW32__)
   // Opens a file and constructs a file object representing this file by
@@ -325,15 +327,24 @@ class FMT_API file {
 #  endif
 };
 
+struct FMT_API pipe {
+  file read_end;
+  file write_end;
+
+  // Creates a pipe setting up read_end and write_end file objects for reading
+  // and writing respectively.
+  pipe();
+};
+
 // Returns the memory page size.
-long getpagesize();
+auto getpagesize() -> long;
 
 namespace detail {
 
 struct buffer_size {
   buffer_size() = default;
   size_t value = 0;
-  buffer_size operator=(size_t val) const {
+  auto operator=(size_t val) const -> buffer_size {
     auto bs = buffer_size();
     bs.value = val;
     return bs;
@@ -366,9 +377,10 @@ struct ostream_params {
 };
 
 class file_buffer final : public buffer<char> {
+ private:
   file file_;
 
-  FMT_API void grow(size_t) override;
+  FMT_API static void grow(buffer<char>& buf, size_t);
 
  public:
   FMT_API file_buffer(cstring_view path, const ostream_params& params);
@@ -410,7 +422,7 @@ class FMT_API ostream {
   void flush() { buffer_.flush(); }
 
   template <typename... T>
-  friend ostream output_file(cstring_view path, T... params);
+  friend auto output_file(cstring_view path, T... params) -> ostream;
 
   void close() { buffer_.close(); }
 
@@ -419,7 +431,7 @@ class FMT_API ostream {
     output to the file.
    */
   template <typename... T> void print(format_string<T...> fmt, T&&... args) {
-    vformat_to(detail::buffer_appender<char>(buffer_), fmt,
+    vformat_to(std::back_inserter(buffer_), fmt,
                fmt::make_format_args(args...));
   }
 };
@@ -440,7 +452,7 @@ class FMT_API ostream {
   \endrst
  */
 template <typename... T>
-inline ostream output_file(cstring_view path, T... params) {
+inline auto output_file(cstring_view path, T... params) -> ostream {
   return {path, detail::ostream_params(params...)};
 }
 #endif  // FMT_USE_FCNTL
diff --git a/src/fmt/ostream.h b/src/fmt/ostream.h
index a112fe7ba9..26fb3b5ac0 100644
--- a/src/fmt/ostream.h
+++ b/src/fmt/ostream.h
@@ -10,19 +10,50 @@
 
 #include <fstream>  // std::filebuf
 
-#if defined(_WIN32) && defined(__GLIBCXX__)
-#  include <ext/stdio_filebuf.h>
-#  include <ext/stdio_sync_filebuf.h>
-#elif defined(_WIN32) && defined(_LIBCPP_VERSION)
-#  include <__std_stream>
+#ifdef _WIN32
+#  ifdef __GLIBCXX__
+#    include <ext/stdio_filebuf.h>
+#    include <ext/stdio_sync_filebuf.h>
+#  endif
+#  include <io.h>
 #endif
 
 #include "format.h"
 
 FMT_BEGIN_NAMESPACE
-
 namespace detail {
 
+template <typename Streambuf> class formatbuf : public Streambuf {
+ private:
+  using char_type = typename Streambuf::char_type;
+  using streamsize = decltype(std::declval<Streambuf>().sputn(nullptr, 0));
+  using int_type = typename Streambuf::int_type;
+  using traits_type = typename Streambuf::traits_type;
+
+  buffer<char_type>& buffer_;
+
+ public:
+  explicit formatbuf(buffer<char_type>& buf) : buffer_(buf) {}
+
+ protected:
+  // The put area is always empty. This makes the implementation simpler and has
+  // the advantage that the streambuf and the buffer are always in sync and
+  // sputc never writes into uninitialized memory. A disadvantage is that each
+  // call to sputc always results in a (virtual) call to overflow. There is no
+  // disadvantage here for sputn since this always results in a call to xsputn.
+
+  auto overflow(int_type ch) -> int_type override {
+    if (!traits_type::eq_int_type(ch, traits_type::eof()))
+      buffer_.push_back(static_cast<char_type>(ch));
+    return ch;
+  }
+
+  auto xsputn(const char_type* s, streamsize count) -> streamsize override {
+    buffer_.append(s, s + count);
+    return count;
+  }
+};
+
 // Generate a unique explicit instantion in every translation unit using a tag
 // type in an anonymous namespace.
 namespace {
@@ -37,36 +68,40 @@ class file_access {
 template class file_access<file_access_tag, std::filebuf,
                            &std::filebuf::_Myfile>;
 auto get_file(std::filebuf&) -> FILE*;
-#elif defined(_WIN32) && defined(_LIBCPP_VERSION)
-template class file_access<file_access_tag, std::__stdoutbuf<char>,
-                           &std::__stdoutbuf<char>::__file_>;
-auto get_file(std::__stdoutbuf<char>&) -> FILE*;
 #endif
 
-inline bool write_ostream_unicode(std::ostream& os, fmt::string_view data) {
+inline auto write_ostream_unicode(std::ostream& os, fmt::string_view data)
+    -> bool {
+  FILE* f = nullptr;
 #if FMT_MSC_VERSION
   if (auto* buf = dynamic_cast<std::filebuf*>(os.rdbuf()))
-    if (FILE* f = get_file(*buf)) return write_console(f, data);
-#elif defined(_WIN32) && defined(__GLIBCXX__)
-  auto* rdbuf = os.rdbuf();
-  FILE* c_file;
-  if (auto* sfbuf = dynamic_cast<__gnu_cxx::stdio_sync_filebuf<char>*>(rdbuf))
-    c_file = sfbuf->file();
-  else if (auto* fbuf = dynamic_cast<__gnu_cxx::stdio_filebuf<char>*>(rdbuf))
-    c_file = fbuf->file();
+    f = get_file(*buf);
+  else
+    return false;
+#elif defined(_WIN32) && defined(__GLIBCXX__)
+  auto* rdbuf = os.rdbuf();
+  if (auto* sfbuf = dynamic_cast<__gnu_cxx::stdio_sync_filebuf<char>*>(rdbuf))
+    f = sfbuf->file();
+  else if (auto* fbuf = dynamic_cast<__gnu_cxx::stdio_filebuf<char>*>(rdbuf))
+    f = fbuf->file();
   else
     return false;
-  if (c_file) return write_console(c_file, data);
-#elif defined(_WIN32) && defined(_LIBCPP_VERSION)
-  if (auto* buf = dynamic_cast<std::__stdoutbuf<char>*>(os.rdbuf()))
-    if (FILE* f = get_file(*buf)) return write_console(f, data);
 #else
-  ignore_unused(os, data);
+  ignore_unused(os, data, f);
+#endif
+#ifdef _WIN32
+  if (f) {
+    int fd = _fileno(f);
+    if (_isatty(fd)) {
+      os.flush();
+      return write_console(fd, data);
+    }
+  }
 #endif
   return false;
 }
-inline bool write_ostream_unicode(std::wostream&,
-                                  fmt::basic_string_view<wchar_t>) {
+inline auto write_ostream_unicode(std::wostream&,
+                                  fmt::basic_string_view<wchar_t>) -> bool {
   return false;
 }
 
@@ -87,18 +122,19 @@ void write_buffer(std::basic_ostream<Char>& os, buffer<Char>& buf) {
 }
 
 template <typename Char, typename T>
-void format_value(buffer<Char>& buf, const T& value,
-                  locale_ref loc = locale_ref()) {
+void format_value(buffer<Char>& buf, const T& value) {
   auto&& format_buf = formatbuf<std::basic_streambuf<Char>>(buf);
   auto&& output = std::basic_ostream<Char>(&format_buf);
 #if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
-  if (loc) output.imbue(loc.get<std::locale>());
+  output.imbue(std::locale::classic());  // The default is always unlocalized.
 #endif
   output << value;
   output.exceptions(std::ios_base::failbit | std::ios_base::badbit);
 }
 
-template <typename T> struct streamed_view { const T& value; };
+template <typename T> struct streamed_view {
+  const T& value;
+};
 
 }  // namespace detail
 
@@ -111,7 +147,7 @@ struct basic_ostream_formatter : formatter<basic_string_view<Char>, Char> {
   auto format(const T& value, basic_format_context<OutputIt, Char>& ctx) const
       -> OutputIt {
     auto buffer = basic_memory_buffer<Char>();
-    detail::format_value(buffer, value, ctx.locale());
+    detail::format_value(buffer, value);
     return formatter<basic_string_view<Char>, Char>::format(
         {buffer.data(), buffer.size()}, ctx);
   }
@@ -140,7 +176,7 @@ struct formatter<detail::streamed_view<T>, Char>
   \endrst
  */
 template <typename T>
-auto streamed(const T& value) -> detail::streamed_view<T> {
+constexpr auto streamed(const T& value) -> detail::streamed_view<T> {
   return {value};
 }
 
diff --git a/src/fmt/printf.h b/src/fmt/printf.h
index adef6adf83..35445abce2 100644
--- a/src/fmt/printf.h
+++ b/src/fmt/printf.h
@@ -16,13 +16,19 @@
 FMT_BEGIN_NAMESPACE
 FMT_BEGIN_EXPORT
 
-template <typename T> struct printf_formatter { printf_formatter() = delete; };
+template <typename T> struct printf_formatter {
+  printf_formatter() = delete;
+};
 
 template <typename Char> class basic_printf_context {
  private:
   detail::buffer_appender<Char> out_;
   basic_format_args<basic_printf_context> args_;
 
+  static_assert(std::is_same<Char, char>::value ||
+                    std::is_same<Char, wchar_t>::value,
+                "Unsupported code unit type.");
+
  public:
   using char_type = Char;
   using parse_context_type = basic_format_parse_context<Char>;
@@ -47,9 +53,7 @@ template <typename Char> class basic_printf_context {
     return args_.get(id);
   }
 
-  FMT_CONSTEXPR void on_error(const char* message) {
-    detail::error_handler().on_error(message);
-  }
+  void on_error(const char* message) { throw_format_error(message); }
 };
 
 namespace detail {
@@ -102,7 +106,9 @@ struct is_zero_int {
 
 template <typename T> struct make_unsigned_or_bool : std::make_unsigned<T> {};
 
-template <> struct make_unsigned_or_bool<bool> { using type = bool; };
+template <> struct make_unsigned_or_bool<bool> {
+  using type = bool;
+};
 
 template <typename T, typename Context> class arg_converter {
  private:
@@ -157,7 +163,7 @@ template <typename T, typename Context> class arg_converter {
 // unsigned).
 template <typename T, typename Context, typename Char>
 void convert_arg(basic_format_arg<Context>& arg, Char type) {
-  visit_format_arg(arg_converter<T, Context>(arg, type), arg);
+  arg.visit(arg_converter<T, Context>(arg, type));
 }
 
 // Converts an integer argument to char for printf.
@@ -360,8 +366,8 @@ auto parse_header(const Char*& it, const Char* end, format_specs<Char>& specs,
       if (specs.width == -1) throw_format_error("number is too big");
     } else if (*it == '*') {
       ++it;
-      specs.width = static_cast<int>(visit_format_arg(
-          detail::printf_width_handler<Char>(specs), get_arg(-1)));
+      specs.width = static_cast<int>(
+          get_arg(-1).visit(detail::printf_width_handler<Char>(specs)));
     }
   }
   return arg_index;
@@ -456,8 +462,8 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
         specs.precision = parse_nonnegative_int(it, end, 0);
       } else if (c == '*') {
         ++it;
-        specs.precision = static_cast<int>(
-            visit_format_arg(printf_precision_handler(), get_arg(-1)));
+        specs.precision =
+            static_cast<int>(get_arg(-1).visit(printf_precision_handler()));
       } else {
         specs.precision = 0;
       }
@@ -471,14 +477,14 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
       specs.fill[0] = ' ';
     }
     if (specs.precision >= 0 && arg.type() == type::cstring_type) {
-      auto str = visit_format_arg(get_cstring<Char>(), arg);
+      auto str = arg.visit(get_cstring<Char>());
       auto str_end = str + specs.precision;
       auto nul = std::find(str, str_end, Char());
       auto sv = basic_string_view<Char>(
           str, to_unsigned(nul != str_end ? nul - str : specs.precision));
       arg = make_arg<basic_printf_context<Char>>(sv);
     }
-    if (specs.alt && visit_format_arg(is_zero_int(), arg)) specs.alt = false;
+    if (specs.alt && arg.visit(is_zero_int())) specs.alt = false;
     if (specs.fill[0] == '0') {
       if (arg.is_arithmetic() && specs.align != align::left)
         specs.align = align::numeric;
@@ -538,7 +544,7 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
         type = 'd';
         break;
       case 'c':
-        visit_format_arg(char_converter<basic_printf_context<Char>>(arg), arg);
+        arg.visit(char_converter<basic_printf_context<Char>>(arg));
         break;
       }
     }
@@ -549,7 +555,7 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
     start = it;
 
     // Format argument.
-    visit_format_arg(printf_arg_formatter<Char>(out, specs, context), arg);
+    arg.visit(printf_arg_formatter<Char>(out, specs, context));
   }
   write(out, basic_string_view<Char>(start, to_unsigned(it - start)));
 }
diff --git a/src/fmt/ranges.h b/src/fmt/ranges.h
index 65beba5bfc..a9cd60e594 100644
--- a/src/fmt/ranges.h
+++ b/src/fmt/ranges.h
@@ -1,13 +1,9 @@
-// Formatting library for C++ - experimental range support
+// Formatting library for C++ - range and tuple support
 //
-// Copyright (c) 2012 - present, Victor Zverovich
+// Copyright (c) 2012 - present, Victor Zverovich and {fmt} contributors
 // All rights reserved.
 //
 // For the license information refer to format.h.
-//
-// Copyright (c) 2018 - present, Remotion (Igor Schulz)
-// All Rights Reserved
-// {fmt} support for ranges, containers and types tuple interface.
 
 #ifndef FMT_RANGES_H_
 #define FMT_RANGES_H_
@@ -187,7 +183,7 @@ template <size_t N> using make_index_sequence = std::make_index_sequence<N>;
 template <typename T, T... N> struct integer_sequence {
   using value_type = T;
 
-  static FMT_CONSTEXPR size_t size() { return sizeof...(N); }
+  static FMT_CONSTEXPR auto size() -> size_t { return sizeof...(N); }
 };
 
 template <size_t... N> using index_sequence = integer_sequence<size_t, N...>;
@@ -211,15 +207,15 @@ class is_tuple_formattable_ {
 };
 template <typename T, typename C> class is_tuple_formattable_<T, C, true> {
   template <std::size_t... Is>
-  static std::true_type check2(index_sequence<Is...>,
-                               integer_sequence<bool, (Is == Is)...>);
-  static std::false_type check2(...);
+  static auto check2(index_sequence<Is...>,
+                     integer_sequence<bool, (Is == Is)...>) -> std::true_type;
+  static auto check2(...) -> std::false_type;
   template <std::size_t... Is>
-  static decltype(check2(
+  static auto check(index_sequence<Is...>) -> decltype(check2(
       index_sequence<Is...>{},
-      integer_sequence<
-          bool, (is_formattable<typename std::tuple_element<Is, T>::type,
-                                C>::value)...>{})) check(index_sequence<Is...>);
+      integer_sequence<bool,
+                       (is_formattable<typename std::tuple_element<Is, T>::type,
+                                       C>::value)...>{}));
 
  public:
   static constexpr const bool value =
@@ -421,6 +417,12 @@ struct is_formattable_delayed
 #endif
 }  // namespace detail
 
+template <typename...> struct conjunction : std::true_type {};
+template <typename P> struct conjunction<P> : P {};
+template <typename P1, typename... Pn>
+struct conjunction<P1, Pn...>
+    : conditional_t<bool(P1::value), conjunction<Pn...>, P1> {};
+
 template <typename T, typename Char, typename Enable = void>
 struct range_formatter;
 
@@ -486,7 +488,8 @@ struct range_formatter<
     for (; it != end; ++it) {
       if (i > 0) out = detail::copy_str<Char>(separator_, out);
       ctx.advance_to(out);
-      out = underlying_.format(mapper.map(*it), ctx);
+      auto&& item = *it;
+      out = underlying_.format(mapper.map(item), ctx);
       ++i;
     }
     out = detail::copy_str<Char>(closing_bracket_, out);
@@ -571,6 +574,83 @@ struct formatter<
                                       Char> {
 };
 
+template <typename It, typename Sentinel, typename Char = char>
+struct join_view : detail::view {
+  It begin;
+  Sentinel end;
+  basic_string_view<Char> sep;
+
+  join_view(It b, Sentinel e, basic_string_view<Char> s)
+      : begin(b), end(e), sep(s) {}
+};
+
+template <typename It, typename Sentinel, typename Char>
+struct formatter<join_view<It, Sentinel, Char>, Char> {
+ private:
+  using value_type =
+#ifdef __cpp_lib_ranges
+      std::iter_value_t<It>;
+#else
+      typename std::iterator_traits<It>::value_type;
+#endif
+  formatter<remove_cvref_t<value_type>, Char> value_formatter_;
+
+ public:
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const Char* {
+    return value_formatter_.parse(ctx);
+  }
+
+  template <typename FormatContext>
+  auto format(const join_view<It, Sentinel, Char>& value,
+              FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto it = value.begin;
+    auto out = ctx.out();
+    if (it != value.end) {
+      out = value_formatter_.format(*it, ctx);
+      ++it;
+      while (it != value.end) {
+        out = detail::copy_str<Char>(value.sep.begin(), value.sep.end(), out);
+        ctx.advance_to(out);
+        out = value_formatter_.format(*it, ctx);
+        ++it;
+      }
+    }
+    return out;
+  }
+};
+
+/**
+  Returns a view that formats the iterator range `[begin, end)` with elements
+  separated by `sep`.
+ */
+template <typename It, typename Sentinel>
+auto join(It begin, Sentinel end, string_view sep) -> join_view<It, Sentinel> {
+  return {begin, end, sep};
+}
+
+/**
+  \rst
+  Returns a view that formats `range` with elements separated by `sep`.
+
+  **Example**::
+
+    std::vector<int> v = {1, 2, 3};
+    fmt::print("{}", fmt::join(v, ", "));
+    // Output: "1, 2, 3"
+
+  ``fmt::join`` applies passed format specifiers to the range elements::
+
+    fmt::print("{:02}", fmt::join(v, ", "));
+    // Output: "01, 02, 03"
+  \endrst
+ */
+template <typename Range>
+auto join(Range&& range, string_view sep)
+    -> join_view<detail::iterator_t<Range>, detail::sentinel_t<Range>> {
+  return join(std::begin(range), std::end(range), sep);
+}
+
 template <typename Char, typename... T> struct tuple_join_view : detail::view {
   const std::tuple<T...>& tuple;
   basic_string_view<Char> sep;
@@ -705,13 +785,6 @@ FMT_CONSTEXPR auto join(const std::tuple<T...>& tuple, string_view sep)
   return {tuple, sep};
 }
 
-template <typename... T>
-FMT_CONSTEXPR auto join(const std::tuple<T...>& tuple,
-                        basic_string_view<wchar_t> sep)
-    -> tuple_join_view<wchar_t, T...> {
-  return {tuple, sep};
-}
-
 /**
   \rst
   Returns an object that formats `initializer_list` with elements separated by
diff --git a/src/fmt/std.h b/src/fmt/std.h
index b4e055c28d..7cff115920 100644
--- a/src/fmt/std.h
+++ b/src/fmt/std.h
@@ -38,6 +38,10 @@
 #  endif
 #endif
 
+#if FMT_CPLUSPLUS > 201703L && FMT_HAS_INCLUDE(<source_location>)
+#  include <source_location>
+#endif
+
 // GCC 4 does not support FMT_HAS_INCLUDE.
 #if FMT_HAS_INCLUDE(<cxxabi.h>) || defined(__GLIBCXX__)
 #  include <cxxabi.h>
@@ -59,43 +63,53 @@
 #  endif
 #endif
 
-#ifdef __cpp_lib_filesystem
+// For older Xcode versions, __cpp_lib_xxx flags are inaccurately defined.
+#ifndef FMT_CPP_LIB_FILESYSTEM
+#  ifdef __cpp_lib_filesystem
+#    define FMT_CPP_LIB_FILESYSTEM __cpp_lib_filesystem
+#  else
+#    define FMT_CPP_LIB_FILESYSTEM 0
+#  endif
+#endif
+
+#ifndef FMT_CPP_LIB_VARIANT
+#  ifdef __cpp_lib_variant
+#    define FMT_CPP_LIB_VARIANT __cpp_lib_variant
+#  else
+#    define FMT_CPP_LIB_VARIANT 0
+#  endif
+#endif
+
+#if FMT_CPP_LIB_FILESYSTEM
 FMT_BEGIN_NAMESPACE
 
 namespace detail {
 
-template <typename Char> auto get_path_string(const std::filesystem::path& p) {
-  return p.string<Char>();
+template <typename Char, typename PathChar>
+auto get_path_string(const std::filesystem::path& p,
+                     const std::basic_string<PathChar>& native) {
+  if constexpr (std::is_same_v<Char, char> && std::is_same_v<PathChar, wchar_t>)
+    return to_utf8<wchar_t>(native, to_utf8_error_policy::replace);
+  else
+    return p.string<Char>();
 }
 
-template <typename Char>
+template <typename Char, typename PathChar>
 void write_escaped_path(basic_memory_buffer<Char>& quoted,
-                        const std::filesystem::path& p) {
-  write_escaped_string<Char>(std::back_inserter(quoted), p.string<Char>());
-}
-
-#  ifdef _WIN32
-template <>
-inline auto get_path_string<char>(const std::filesystem::path& p) {
-  return to_utf8<wchar_t>(p.native(), to_utf8_error_policy::replace);
-}
-
-template <>
-inline void write_escaped_path<char>(memory_buffer& quoted,
-                                     const std::filesystem::path& p) {
-  auto buf = basic_memory_buffer<wchar_t>();
-  write_escaped_string<wchar_t>(std::back_inserter(buf), p.native());
-  bool valid = to_utf8<wchar_t>::convert(quoted, {buf.data(), buf.size()});
-  FMT_ASSERT(valid, "invalid utf16");
-}
-#  endif  // _WIN32
-
-template <>
-inline void write_escaped_path<std::filesystem::path::value_type>(
-    basic_memory_buffer<std::filesystem::path::value_type>& quoted,
-    const std::filesystem::path& p) {
-  write_escaped_string<std::filesystem::path::value_type>(
-      std::back_inserter(quoted), p.native());
+                        const std::filesystem::path& p,
+                        const std::basic_string<PathChar>& native) {
+  if constexpr (std::is_same_v<Char, char> &&
+                std::is_same_v<PathChar, wchar_t>) {
+    auto buf = basic_memory_buffer<wchar_t>();
+    write_escaped_string<wchar_t>(std::back_inserter(buf), native);
+    bool valid = to_utf8<wchar_t>::convert(quoted, {buf.data(), buf.size()});
+    FMT_ASSERT(valid, "invalid utf16");
+  } else if constexpr (std::is_same_v<Char, PathChar>) {
+    write_escaped_string<std::filesystem::path::value_type>(
+        std::back_inserter(quoted), native);
+  } else {
+    write_escaped_string<Char>(std::back_inserter(quoted), p.string<Char>());
+  }
 }
 
 }  // namespace detail
@@ -106,6 +120,7 @@ template <typename Char> struct formatter<std::filesystem::path, Char> {
   format_specs<Char> specs_;
   detail::arg_ref<Char> width_ref_;
   bool debug_ = false;
+  char path_type_ = 0;
 
  public:
   FMT_CONSTEXPR void set_debug_format(bool set = true) { debug_ = set; }
@@ -122,29 +137,62 @@ template <typename Char> struct formatter<std::filesystem::path, Char> {
       debug_ = true;
       ++it;
     }
+    if (it != end && (*it == 'g')) path_type_ = *it++;
     return it;
   }
 
   template <typename FormatContext>
   auto format(const std::filesystem::path& p, FormatContext& ctx) const {
     auto specs = specs_;
+#  ifdef _WIN32
+    auto path_string = !path_type_ ? p.native() : p.generic_wstring();
+#  else
+    auto path_string = !path_type_ ? p.native() : p.generic_string();
+#  endif
+
     detail::handle_dynamic_spec<detail::width_checker>(specs.width, width_ref_,
                                                        ctx);
     if (!debug_) {
-      auto s = detail::get_path_string<Char>(p);
+      auto s = detail::get_path_string<Char>(p, path_string);
       return detail::write(ctx.out(), basic_string_view<Char>(s), specs);
     }
     auto quoted = basic_memory_buffer<Char>();
-    detail::write_escaped_path(quoted, p);
+    detail::write_escaped_path(quoted, p, path_string);
     return detail::write(ctx.out(),
                          basic_string_view<Char>(quoted.data(), quoted.size()),
                          specs);
   }
 };
 FMT_END_NAMESPACE
-#endif
+#endif  // FMT_CPP_LIB_FILESYSTEM
 
 FMT_BEGIN_NAMESPACE
+FMT_EXPORT
+template <std::size_t N, typename Char>
+struct formatter<std::bitset<N>, Char> : nested_formatter<string_view> {
+ private:
+  // Functor because C++11 doesn't support generic lambdas.
+  struct writer {
+    const std::bitset<N>& bs;
+
+    template <typename OutputIt>
+    FMT_CONSTEXPR auto operator()(OutputIt out) -> OutputIt {
+      for (auto pos = N; pos > 0; --pos) {
+        out = detail::write<Char>(out, bs[pos - 1] ? Char('1') : Char('0'));
+      }
+
+      return out;
+    }
+  };
+
+ public:
+  template <typename FormatContext>
+  auto format(const std::bitset<N>& bs, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    return write_padded(ctx, writer{bs});
+  }
+};
+
 FMT_EXPORT
 template <typename Char>
 struct formatter<std::thread::id, Char> : basic_ostream_formatter<Char> {};
@@ -180,7 +228,7 @@ struct formatter<std::optional<T>, Char,
   }
 
   template <typename FormatContext>
-  auto format(std::optional<T> const& opt, FormatContext& ctx) const
+  auto format(const std::optional<T>& opt, FormatContext& ctx) const
       -> decltype(ctx.out()) {
     if (!opt) return detail::write<Char>(ctx.out(), none);
 
@@ -194,7 +242,32 @@ struct formatter<std::optional<T>, Char,
 FMT_END_NAMESPACE
 #endif  // __cpp_lib_optional
 
-#ifdef __cpp_lib_variant
+#ifdef __cpp_lib_source_location
+FMT_BEGIN_NAMESPACE
+FMT_EXPORT
+template <> struct formatter<std::source_location> {
+  template <typename ParseContext> FMT_CONSTEXPR auto parse(ParseContext& ctx) {
+    return ctx.begin();
+  }
+
+  template <typename FormatContext>
+  auto format(const std::source_location& loc, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    auto out = ctx.out();
+    out = detail::write(out, loc.file_name());
+    out = detail::write(out, ':');
+    out = detail::write<char>(out, loc.line());
+    out = detail::write(out, ':');
+    out = detail::write<char>(out, loc.column());
+    out = detail::write(out, ": ");
+    out = detail::write(out, loc.function_name());
+    return out;
+  }
+};
+FMT_END_NAMESPACE
+#endif
+
+#if FMT_CPP_LIB_VARIANT
 FMT_BEGIN_NAMESPACE
 namespace detail {
 
@@ -285,7 +358,7 @@ struct formatter<
   }
 };
 FMT_END_NAMESPACE
-#endif  // __cpp_lib_variant
+#endif  // FMT_CPP_LIB_VARIANT
 
 FMT_BEGIN_NAMESPACE
 FMT_EXPORT
@@ -309,7 +382,7 @@ template <typename Char> struct formatter<std::error_code, Char> {
 FMT_EXPORT
 template <typename T, typename Char>
 struct formatter<
-    T, Char,
+    T, Char,  // DEPRECATED! Mixing code unit types.
     typename std::enable_if<std::is_base_of<std::exception, T>::value>::type> {
  private:
   bool with_typename_ = false;
@@ -340,7 +413,7 @@ struct formatter<
 #  ifdef FMT_HAS_ABI_CXA_DEMANGLE
     int status = 0;
     std::size_t size = 0;
-    std::unique_ptr<char, decltype(&std::free)> demangled_name_ptr(
+    std::unique_ptr<char, void (*)(void*)> demangled_name_ptr(
         abi::__cxa_demangle(ti.name(), nullptr, &size, &status), &std::free);
 
     string_view demangled_name_view;
@@ -451,15 +524,14 @@ struct formatter<std::atomic<T>, Char,
 #ifdef __cpp_lib_atomic_flag_test
 FMT_EXPORT
 template <typename Char>
-struct formatter<std::atomic_flag, Char>
-    : formatter<bool, Char> {
+struct formatter<std::atomic_flag, Char> : formatter<bool, Char> {
   template <typename FormatContext>
   auto format(const std::atomic_flag& v, FormatContext& ctx) const
       -> decltype(ctx.out()) {
     return formatter<bool, Char>::format(v.test(), ctx);
   }
 };
-#endif // __cpp_lib_atomic_flag_test
+#endif  // __cpp_lib_atomic_flag_test
 
 FMT_END_NAMESPACE
 #endif  // FMT_STD_H_
diff --git a/src/fmt/xchar.h b/src/fmt/xchar.h
index 625ec36922..1e791bb07b 100644
--- a/src/fmt/xchar.h
+++ b/src/fmt/xchar.h
@@ -11,6 +11,7 @@
 #include <cwchar>
 
 #include "format.h"
+#include "ranges.h"
 
 #ifndef FMT_STATIC_THOUSANDS_SEPARATOR
 #  include <locale>
@@ -22,7 +23,7 @@ namespace detail {
 template <typename T>
 using is_exotic_char = bool_constant<!std::is_same<T, char>::value>;
 
-inline auto write_loc(std::back_insert_iterator<detail::buffer<wchar_t>> out,
+inline auto write_loc(back_insert_iterator<detail::buffer<wchar_t>> out,
                       loc_value value, const format_specs<wchar_t>& specs,
                       locale_ref loc) -> bool {
 #ifndef FMT_STATIC_THOUSANDS_SEPARATOR
@@ -63,14 +64,15 @@ template <> struct is_char<char16_t> : std::true_type {};
 template <> struct is_char<char32_t> : std::true_type {};
 
 template <typename... T>
-constexpr format_arg_store<wformat_context, T...> make_wformat_args(
-    const T&... args) {
+constexpr auto make_wformat_args(const T&... args)
+    -> format_arg_store<wformat_context, T...> {
   return {args...};
 }
 
 inline namespace literals {
 #if FMT_USE_USER_DEFINED_LITERALS && !FMT_USE_NONTYPE_TEMPLATE_ARGS
-constexpr detail::udl_arg<wchar_t> operator"" _a(const wchar_t* s, size_t) {
+constexpr auto operator""_a(const wchar_t* s, size_t)
+    -> detail::udl_arg<wchar_t> {
   return {s};
 }
 #endif
@@ -95,6 +97,12 @@ auto join(std::initializer_list<T> list, wstring_view sep)
   return join(std::begin(list), std::end(list), sep);
 }
 
+template <typename... T>
+auto join(const std::tuple<T...>& tuple, basic_string_view<wchar_t> sep)
+    -> tuple_join_view<wchar_t, T...> {
+  return {tuple, sep};
+}
+
 template <typename Char, FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
 auto vformat(basic_string_view<Char> format_str,
              basic_format_args<buffer_context<type_identity_t<Char>>> args)
@@ -172,11 +180,11 @@ inline auto vformat_to(
   return detail::get_iterator(buf, out);
 }
 
-template <
-    typename OutputIt, typename Locale, typename S, typename... T,
-    typename Char = char_t<S>,
-    bool enable = detail::is_output_iterator<OutputIt, Char>::value&&
-        detail::is_locale<Locale>::value&& detail::is_exotic_char<Char>::value>
+template <typename OutputIt, typename Locale, typename S, typename... T,
+          typename Char = char_t<S>,
+          bool enable = detail::is_output_iterator<OutputIt, Char>::value &&
+                        detail::is_locale<Locale>::value &&
+                        detail::is_exotic_char<Char>::value>
 inline auto format_to(OutputIt out, const Locale& loc, const S& format_str,
                       T&&... args) ->
     typename std::enable_if<enable, OutputIt>::type {
diff --git a/src/fmtlib_os.cpp b/src/fmtlib_os.cpp
index ff5fe79a5e..3338d13cae 100644
--- a/src/fmtlib_os.cpp
+++ b/src/fmtlib_os.cpp
@@ -19,8 +19,8 @@
 #  include <sys/stat.h>
 #  include <sys/types.h>
 
-#  ifdef _WRS_KERNEL   // VxWorks7 kernel
-#    include <ioLib.h> // getpagesize
+#  ifdef _WRS_KERNEL    // VxWorks7 kernel
+#    include <ioLib.h>  // getpagesize
 #  endif
 
 #  ifndef _WIN32
@@ -183,10 +183,14 @@ void buffered_file::close() {
 }
 
 int buffered_file::descriptor() const {
-#ifdef fileno  // fileno is a macro on OpenBSD so we cannot use FMT_POSIX_CALL.
-  int fd = fileno(file_);
-#else
+#if !defined(fileno)
   int fd = FMT_POSIX_CALL(fileno(file_));
+#elif defined(FMT_HAS_SYSTEM)
+  // fileno is a macro on OpenBSD so we cannot use FMT_POSIX_CALL.
+#  define FMT_DISABLE_MACRO
+  int fd = FMT_SYSTEM(fileno FMT_DISABLE_MACRO(file_));
+#else
+  int fd = fileno(file_);
 #endif
   if (fd == -1)
     FMT_THROW(system_error(errno, FMT_STRING("cannot get file descriptor")));
@@ -197,6 +201,7 @@ int buffered_file::descriptor() const {
 #  ifdef _WIN32
 using mode_t = int;
 #  endif
+
 constexpr mode_t default_open_mode =
     S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
 
@@ -298,29 +303,6 @@ void file::dup2(int fd, std::error_code& ec) noexcept {
   if (result == -1) ec = std::error_code(errno, std::generic_category());
 }
 
-void file::pipe(file& read_end, file& write_end) {
-  // Close the descriptors first to make sure that assignments don't throw
-  // and there are no leaks.
-  read_end.close();
-  write_end.close();
-  int fds[2] = {};
-#  ifdef _WIN32
-  // Make the default pipe capacity same as on Linux 2.6.11+.
-  enum { DEFAULT_CAPACITY = 65536 };
-  int result = FMT_POSIX_CALL(pipe(fds, DEFAULT_CAPACITY, _O_BINARY));
-#  else
-  // Don't retry as the pipe function doesn't return EINTR.
-  // http://pubs.opengroup.org/onlinepubs/009696799/functions/pipe.html
-  int result = FMT_POSIX_CALL(pipe(fds));
-#  endif
-  if (result != 0)
-    FMT_THROW(system_error(errno, FMT_STRING("cannot create pipe")));
-  // The following assignments don't throw because read_fd and write_fd
-  // are closed.
-  read_end = file(fds[0]);
-  write_end = file(fds[1]);
-}
-
 buffered_file file::fdopen(const char* mode) {
 // Don't retry as fdopen doesn't return EINTR.
 #  if defined(__MINGW32__) && defined(_POSIX_)
@@ -349,6 +331,24 @@ file file::open_windows_file(wcstring_view path, int oflag) {
 }
 #  endif
 
+pipe::pipe() {
+  int fds[2] = {};
+#  ifdef _WIN32
+  // Make the default pipe capacity same as on Linux 2.6.11+.
+  enum { DEFAULT_CAPACITY = 65536 };
+  int result = FMT_POSIX_CALL(pipe(fds, DEFAULT_CAPACITY, _O_BINARY));
+#  else
+  // Don't retry as the pipe function doesn't return EINTR.
+  // http://pubs.opengroup.org/onlinepubs/009696799/functions/pipe.html
+  int result = FMT_POSIX_CALL(pipe(fds));
+#  endif
+  if (result != 0)
+    FMT_THROW(system_error(errno, FMT_STRING("cannot create pipe")));
+  // The following assignments don't throw.
+  read_end = file(fds[0]);
+  write_end = file(fds[1]);
+}
+
 #  if !defined(__MSDOS__)
 long getpagesize() {
 #    ifdef _WIN32
@@ -371,18 +371,17 @@ long getpagesize() {
 
 namespace detail {
 
-void file_buffer::grow(size_t) {
-  if (this->size() == this->capacity()) flush();
+void file_buffer::grow(buffer<char>& buf, size_t) {
+  if (buf.size() == buf.capacity()) static_cast<file_buffer&>(buf).flush();
 }
 
-file_buffer::file_buffer(cstring_view path,
-                         const detail::ostream_params& params)
-    : file_(path, params.oflag) {
+file_buffer::file_buffer(cstring_view path, const ostream_params& params)
+    : buffer<char>(grow), file_(path, params.oflag) {
   set(new char[params.buffer_size], params.buffer_size);
 }
 
 file_buffer::file_buffer(file_buffer&& other)
-    : detail::buffer<char>(other.data(), other.size(), other.capacity()),
+    : buffer<char>(grow, other.data(), other.size(), other.capacity()),
       file_(std::move(other.file_)) {
   other.clear();
   other.set(nullptr, 0);
diff --git a/src/min_linesearch.cpp b/src/min_linesearch.cpp
index 24ba4c5c23..f875d4249e 100644
--- a/src/min_linesearch.cpp
+++ b/src/min_linesearch.cpp
@@ -329,7 +329,7 @@ int MinLineSearch::linemin_quadratic(double eoriginal, double &alpha)
   double fdothall,fdothme,hme,hmax,hmaxall;
   double de_ideal,de;
   double delfh,engprev,relerr,alphaprev,fhprev,fh,alpha0;
-  double dot[2],dotall[2];
+  double dot,dotall;
   double *xatom,*x0atom,*fatom,*hatom;
   double alphamax;
 
@@ -417,10 +417,9 @@ int MinLineSearch::linemin_quadratic(double eoriginal, double &alpha)
 
     // compute new fh, alpha, delfh
 
-    dot[0] = dot[1] = 0.0;
+    dot = 0.0;
     for (i = 0; i < nvec; i++) {
-      dot[0] += fvec[i]*fvec[i];
-      dot[1] += fvec[i]*h[i];
+      dot += fvec[i]*h[i];
     }
     if (nextra_atom)
       for (m = 0; m < nextra_atom; m++) {
@@ -428,18 +427,16 @@ int MinLineSearch::linemin_quadratic(double eoriginal, double &alpha)
         hatom = hextra_atom[m];
         n = extra_nlen[m];
         for (i = 0; i < n; i++) {
-          dot[0] += fatom[i]*fatom[i];
-          dot[1] += fatom[i]*hatom[i];
+          dot += fatom[i]*hatom[i];
         }
       }
-    MPI_Allreduce(dot,dotall,2,MPI_DOUBLE,MPI_SUM,world);
+    MPI_Allreduce(&dot,&dotall,1,MPI_DOUBLE,MPI_SUM,world);
     if (nextra_global) {
       for (i = 0; i < nextra_global; i++) {
-        dotall[0] += fextra[i]*fextra[i];
-        dotall[1] += fextra[i]*hextra[i];
+        dotall += fextra[i]*hextra[i];
       }
     }
-    fh = dotall[1];
+    fh = dotall;
     if (output->thermo->normflag) fh /= atom->natoms;
 
     delfh = fh - fhprev;
diff --git a/src/molecule.cpp b/src/molecule.cpp
index 6e2d3891d3..f83d8658df 100644
--- a/src/molecule.cpp
+++ b/src/molecule.cpp
@@ -41,15 +41,16 @@ using namespace LAMMPS_NS;
 
 Molecule::Molecule(LAMMPS *lmp, int narg, char **arg, int &index) :
     Pointers(lmp), id(nullptr), x(nullptr), type(nullptr), molecule(nullptr), q(nullptr),
-    radius(nullptr), rmass(nullptr), num_bond(nullptr), bond_type(nullptr), bond_atom(nullptr),
-    num_angle(nullptr), angle_type(nullptr), angle_atom1(nullptr), angle_atom2(nullptr),
-    angle_atom3(nullptr), num_dihedral(nullptr), dihedral_type(nullptr), dihedral_atom1(nullptr),
-    dihedral_atom2(nullptr), dihedral_atom3(nullptr), dihedral_atom4(nullptr),
-    num_improper(nullptr), improper_type(nullptr), improper_atom1(nullptr), improper_atom2(nullptr),
-    improper_atom3(nullptr), improper_atom4(nullptr), nspecial(nullptr), special(nullptr),
-    shake_flag(nullptr), shake_atom(nullptr), shake_type(nullptr), avec_body(nullptr),
-    ibodyparams(nullptr), dbodyparams(nullptr), fragmentmask(nullptr), dx(nullptr), dxcom(nullptr),
-    dxbody(nullptr), quat_external(nullptr), fp(nullptr), count(nullptr)
+    radius(nullptr), rmass(nullptr), mu(nullptr), num_bond(nullptr), bond_type(nullptr),
+    bond_atom(nullptr), num_angle(nullptr), angle_type(nullptr), angle_atom1(nullptr),
+    angle_atom2(nullptr), angle_atom3(nullptr), num_dihedral(nullptr), dihedral_type(nullptr),
+    dihedral_atom1(nullptr), dihedral_atom2(nullptr), dihedral_atom3(nullptr),
+    dihedral_atom4(nullptr), num_improper(nullptr), improper_type(nullptr), improper_atom1(nullptr),
+    improper_atom2(nullptr), improper_atom3(nullptr), improper_atom4(nullptr), nspecial(nullptr),
+    special(nullptr), shake_flag(nullptr), shake_atom(nullptr), shake_type(nullptr),
+    avec_body(nullptr), ibodyparams(nullptr), dbodyparams(nullptr), fragmentmask(nullptr),
+    dx(nullptr), dxcom(nullptr), dxbody(nullptr), quat_external(nullptr), fp(nullptr),
+    count(nullptr)
 {
   me = comm->me;
 
@@ -132,7 +133,7 @@ Molecule::Molecule(LAMMPS *lmp, int narg, char **arg, int &index) :
 
   // initialize all fields to empty
 
-  initialize();
+  Molecule::initialize();
 
   // scan file for sizes of all fields and allocate storage for them
 
@@ -141,28 +142,30 @@ Molecule::Molecule(LAMMPS *lmp, int narg, char **arg, int &index) :
     if (fp == nullptr)
       error->one(FLERR, "Cannot open molecule file {}: {}", arg[ifile], utils::getsyserror());
   }
-  read(0);
+  Molecule::read(0);
   if (me == 0) fclose(fp);
-  allocate();
+  Molecule::allocate();
 
   // read file again to populate all fields
 
   if (me == 0) fp = fopen(arg[ifile], "r");
-  read(1);
+  Molecule::read(1);
   if (me == 0) fclose(fp);
 
   // stats
 
+  if (title.empty()) title = "(no title)";
   if (me == 0)
     utils::logmesg(lmp,
-                   "Read molecule template {}:\n  {} molecules\n"
+                   "Read molecule template {}:\n{}\n"
+                   "  {} molecules\n"
                    "  {} fragments\n"
                    "  {} atoms with max type {}\n"
                    "  {} bonds with max type {}\n"
                    "  {} angles with max type {}\n"
                    "  {} dihedrals with max type {}\n"
                    "  {} impropers with max type {}\n",
-                   id, nmolecules, nfragments, natoms, ntypes, nbonds, nbondtypes, nangles,
+                   id, title, nmolecules, nfragments, natoms, ntypes, nbonds, nbondtypes, nangles,
                    nangletypes, ndihedrals, ndihedraltypes, nimpropers, nimpropertypes);
 }
 
@@ -423,6 +426,8 @@ void Molecule::read(int flag)
     if (eof == nullptr) error->one(FLERR, "Unexpected end of molecule file");
   }
 
+  if (flag == 0) title = utils::trim(line);
+
   // read header lines
   // skip blank lines or lines that start with "#"
   // stop when read an unrecognized line
@@ -572,6 +577,12 @@ void Molecule::read(int flag)
         diameters(line);
       else
         skip_lines(natoms, line, keyword);
+    } else if (keyword == "Dipoles") {
+      muflag = 1;
+      if (flag)
+        dipoles(line);
+      else
+        skip_lines(natoms, line, keyword);
     } else if (keyword == "Masses") {
       rmassflag = 1;
       if (flag)
@@ -948,6 +959,40 @@ void Molecule::diameters(char *line)
   }
 }
 
+/* ----------------------------------------------------------------------
+   read dipoles from file
+------------------------------------------------------------------------- */
+
+void Molecule::dipoles(char *line)
+{
+  for (int i = 0; i < natoms; i++) count[i] = 0;
+  try {
+    for (int i = 0; i < natoms; i++) {
+      readline(line);
+
+      ValueTokenizer values(utils::trim_comment(line));
+      if ((int) values.count() != 4)
+        error->all(FLERR, "Invalid line in Dipoles section of molecule file: {}", line);
+
+      int iatom = values.next_int() - 1;
+      if (iatom < 0 || iatom >= natoms)
+        error->all(FLERR, "Invalid atom index in Dipoles section of molecule file");
+
+      count[iatom]++;
+      mu[iatom][0] = values.next_double();
+      mu[iatom][1] = values.next_double();
+      mu[iatom][2] = values.next_double();
+    }
+  } catch (TokenizerException &e) {
+    error->all(FLERR, "Invalid line in Dipoles section of molecule file: {}\n{}", e.what(), line);
+  }
+
+  for (int i = 0; i < natoms; i++) {
+    if (count[i] == 0)
+      error->all(FLERR, "Atom {} missing in Dipoles section of molecule file", i + 1);
+  }
+}
+
 /* ----------------------------------------------------------------------
    read masses from file
 ------------------------------------------------------------------------- */
@@ -1828,6 +1873,7 @@ void Molecule::check_attributes()
 
   int mismatch = 0;
   if (qflag && !atom->q_flag) mismatch = 1;
+  if (muflag && !atom->mu_flag) mismatch = 1;
   if (radiusflag && !atom->radius_flag) mismatch = 1;
   if (rmassflag && !atom->rmass_flag) mismatch = 1;
 
@@ -1869,6 +1915,7 @@ void Molecule::check_attributes()
 
 void Molecule::initialize()
 {
+  title.clear();
   natoms = 0;
   nbonds = nangles = ndihedrals = nimpropers = 0;
   ntypes = 0;
@@ -1880,7 +1927,7 @@ void Molecule::initialize()
   bond_per_atom = angle_per_atom = dihedral_per_atom = improper_per_atom = 0;
   maxspecial = 0;
 
-  xflag = typeflag = moleculeflag = fragmentflag = qflag = radiusflag = rmassflag = 0;
+  xflag = typeflag = moleculeflag = fragmentflag = qflag = radiusflag = muflag = rmassflag = 0;
   bondflag = angleflag = dihedralflag = improperflag = 0;
   nspecialflag = specialflag = 0;
   shakeflag = shakeflagflag = shakeatomflag = shaketypeflag = 0;
@@ -1943,6 +1990,7 @@ void Molecule::allocate()
       for (int j = 0; j < natoms; j++) fragmentmask[i][j] = 0;
   }
   if (qflag) memory->create(q, natoms, "molecule:q");
+  if (muflag) memory->create(mu, natoms, 3, "molecule:mu");
   if (radiusflag) memory->create(radius, natoms, "molecule:radius");
   if (rmassflag) memory->create(rmass, natoms, "molecule:rmass");
 
@@ -2167,6 +2215,11 @@ void Molecule::print()
     for (int i = 0; i < natoms; i++)
       printf("    %d %g\n",i+1,radius[i]);
   }
+  if (muflag) {
+    printf(  "Dipoles:\n");
+    for (int i = 0; i < natoms; i++)
+      printf("    %d %g %g %g\n",i+1,mu[i][0],mu[i][1],mu[i][2]);
+  }
   if (rmassflag) {
     printf(  "Masses:\n");
     for (int i = 0; i < natoms; i++)
diff --git a/src/molecule.h b/src/molecule.h
index 06a1211ea3..faba036aab 100644
--- a/src/molecule.h
+++ b/src/molecule.h
@@ -25,6 +25,8 @@ class Molecule : protected Pointers {
                // else 0 if not first in set
   int last;    // 1 if last molecule in set, else 0
 
+  std::string title;    // title string of the molecule file
+
   // number of atoms,bonds,etc in molecule
   // nibody,ndbody = # of integer/double fields in body
 
@@ -41,7 +43,7 @@ class Molecule : protected Pointers {
 
   // 1 if attribute defined in file, 0 if not
 
-  int xflag, typeflag, moleculeflag, fragmentflag, qflag, radiusflag, rmassflag;
+  int xflag, typeflag, moleculeflag, fragmentflag, qflag, radiusflag, muflag, rmassflag;
   int bondflag, angleflag, dihedralflag, improperflag;
   int nspecialflag, specialflag;
   int shakeflag, shakeflagflag, shakeatomflag, shaketypeflag;
@@ -63,6 +65,7 @@ class Molecule : protected Pointers {
   double *q;           // charge on each atom
   double *radius;      // radius of each atom
   double *rmass;       // mass of each atom
+  double **mu;         // dipole vector of each atom
 
   int *num_bond;    // bonds, angles, dihedrals, impropers for each atom
   int **bond_type;
@@ -142,6 +145,7 @@ class Molecule : protected Pointers {
   void fragments(char *);
   void charges(char *);
   void diameters(char *);
+  void dipoles(char *);
   void masses(char *);
   void bonds(int, char *);
   void angles(int, char *);
diff --git a/src/replicate.cpp b/src/replicate.cpp
index f5444b9fa8..e0fd5b0e3d 100644
--- a/src/replicate.cpp
+++ b/src/replicate.cpp
@@ -57,7 +57,12 @@ void Replicate::command(int narg, char **arg)
     error->all(FLERR, "Illegal replication grid {}x{}x{}. All replications must be > 0",
                nx, ny, nz);
 
-  int nrep = nx*ny*nz;
+  bigint nrepbig = (bigint) nx * ny * nz;
+  if (nrepbig > MAXSMALLINT)
+    error->all(FLERR, "Total # of replica is too large: {}x{}x{} = {}. "
+               "Please use replicate multiple times", nx, ny, nz, nrepbig);
+
+  int nrep = (int) nrepbig;
   if (me == 0)
     utils::logmesg(lmp, "Replication is creating a {}x{}x{} = {} times larger system...\n",
                    nx, ny, nz, nrep);
diff --git a/src/write_restart.cpp b/src/write_restart.cpp
index a996532687..ad279c14f6 100644
--- a/src/write_restart.cpp
+++ b/src/write_restart.cpp
@@ -545,7 +545,7 @@ void WriteRestart::force_fields()
    all procs call this method, only proc 0 writes to file
 ------------------------------------------------------------------------- */
 
-void WriteRestart::file_layout(int send_size)
+void WriteRestart::file_layout(int /*send_size*/)
 {
   if (me == 0) write_int(MULTIPROC,multiproc);
 
diff --git a/unittest/force-styles/tests/angle-cosine_periodic.yaml b/unittest/force-styles/tests/angle-cosine_periodic.yaml
index 84d8ff1194..5c8227fcbd 100644
--- a/unittest/force-styles/tests/angle-cosine_periodic.yaml
+++ b/unittest/force-styles/tests/angle-cosine_periodic.yaml
@@ -1,6 +1,6 @@
 ---
-lammps_version: 8 Apr 2021
-date_generated: Thu Apr  8 09:28:11 2021
+lammps_version: 21 Nov 2023
+date_generated: Fri Jan 12 18:39:55 2024
 epsilon: 2.5e-13
 prerequisites: ! |
   atom full
@@ -10,77 +10,77 @@ post_commands: ! ""
 input_file: in.fourmol
 angle_style: cosine/periodic
 angle_coeff: ! |
-  1  75.0  1 2
-  2  45.0 -1 2
+  1  75.0  1 1
+  2  45.0  1 2
   3  50.0 -1 3
   4 100.0 -1 4
-equilibrium: 4 3.141592653589793 1.5707963267948966 2.0943951023931957 2.356194490192345
+equilibrium: 4 3.141592653589793 3.141592653589793 2.0943951023931957 2.356194490192345
 extract: ! ""
 natoms: 29
-init_energy: 605.3643061001458
-init_stress: ! |-
-  -1.7082420754402889e+01 -7.3281097507808681e+00  2.4410530505183818e+01  8.5827033671406951e+01  1.4260977966148616e+02  4.1579557432232576e+01
+init_energy: 1178.5476942873006
+init_stress: ! |2-
+   2.7790958427902001e+02 -2.3729473006795436e+02 -4.0614854211065634e+01  2.9034222204142930e+02  1.4123449070173780e+02  2.0504975338277421e+02
 init_forces: ! |2
     1  7.9609486050127529e+00 -3.9274211736421961e+01 -3.8917410871887981e+01
     2  4.6997439470662350e+00  3.8052682089524090e+01  3.0599010994189470e+01
-    3 -7.1532072701475698e+01  9.6873528247272844e+01  7.3410935137796983e+01
-    4  3.1784763224659116e+01 -4.4133218046130608e+01 -6.2234613362865147e+01
-    5  5.8817481848549889e+01 -2.5112568523390145e+01  3.9611729278121981e+00
-    6 -8.7258065964885336e+00 -4.2663580774228997e+01 -1.6819642012415606e+01
+    3 -4.4330179925982058e+01 -1.6514501437366098e+00  1.9894582317318523e+01
+    4  1.1465928779203908e+01 -7.1462736556935234e+00 -1.8983545733370338e+01
+    5  2.7634466780141157e+01  1.5504150132065057e+01  1.0078115065618357e+01
+    6  2.2512674572611367e+01 -5.4260358088923418e+01 -6.0646506351853276e+01
     7 -1.5578858996464229e+01  1.3895348629116569e+01 -3.3939856789628062e+00
-    8 -1.6678237064738614e+01 -2.6557373913973738e+01  8.7708427797183326e+00
-    9 -9.4419020144376677e+00  1.3812152922900303e+01 -1.2280697239365450e+00
-   10  1.0844630504236606e+02  1.9274264686364820e+01  1.2594098114786526e+01
-   11 -1.1888648487599809e+01  1.7288532453781471e+00  1.8714004234488471e+00
-   12  9.7432958614920665e+01  1.1284647087939499e+02 -1.3445218835244805e+02
-   13 -2.2887258478933525e+01 -5.9815335453575649e+01  4.1237962971772127e+01
-   14 -4.6498844054867675e+01 -3.0251289808967520e+01  1.5556535565006259e+01
-   15 -5.3477741242848616e+01 -1.7885978453267143e+01  4.6284681424489207e+01
-   16 -7.3215663693592745e+01  1.7514552522777997e+01  7.4857846653898914e+00
+    8 -6.7011983808365656e+01 -2.4458090084467077e+01  1.7084632474743671e+02
+    9  9.4419020144376677e+00 -1.3812152922900303e+01  1.2280697239365450e+00
+   10  1.3360859023844577e+02  1.1499274633292617e+02 -1.0838863098947982e+02
+   11  1.1888648487599809e+01 -1.7288532453781471e+00 -1.8714004234488471e+00
+   12  2.9260930345940537e+01 -9.2146025429432186e+00 -8.5323421000107373e+01
+   13 -4.6656310032990458e+00 -1.2502935413462930e+01  1.4918864440944628e+01
+   14 -2.1383527724886850e+01 -9.3422692044635554e+00  7.5125645645164223e+00
+   15 -8.0644375221897171e+00 -2.6783296801963008e+00  6.9267625241565547e+00
+   16 -1.1822204683814408e+02  2.1928896047729104e+01  4.0247121672886962e+01
    17  2.0782832048872386e+01 -2.8304296512773977e+01  1.5273484998106287e+01
-   18  1.6481336531704756e+00  1.7222946144801426e+01 -6.9896289164966490e+01
-   19 -2.0180190840279820e+01 -2.5140421523544326e+01  2.9933594625645306e+01
-   20  1.8532057187109345e+01  7.9174753787429015e+00  3.9962694539321184e+01
-   21  1.6243294930835876e+01  2.0333921382774719e+01 -6.0768622624445221e+01
-   22 -2.8924589352090472e+01 -1.9720769613680826e+01  2.1482552755004811e+01
-   23  1.2681294421254595e+01 -6.1315176909389102e-01  3.9286069869440411e+01
-   24 -1.5837796600466618e+01  6.1562453937228881e+01 -3.6651923703785549e+01
-   25 -1.2704181131223443e+01 -4.2563815285902912e+01  6.9610494863238124e+00
-   26  2.8541977731690061e+01 -1.8998638651325965e+01  2.9690874217461737e+01
-   27 -8.7971258084923178e+00  7.2217511410368814e+01 -2.4599681382405976e+01
-   28 -1.9235439225569891e+01 -4.3179911322776611e+01  1.0030656861974458e+00
-   29  2.8032565034062209e+01 -2.9037600087592210e+01  2.3596615696208531e+01
-run_energy: 603.8182365368202
-run_stress: ! |-
-  -1.6098625319219664e+01 -7.7961962067566510e+00  2.3894821525976329e+01  8.7036156470651477e+01  1.4262918929621054e+02  4.2523803236880880e+01
+   18  5.2071052608093424e+00  5.4414090328604708e+01 -2.2082998810309599e+02
+   19 -6.3757194500832497e+01 -7.9428522633699004e+01  9.4572049876109048e+01
+   20  5.8550089240023155e+01  2.5014432305094296e+01  1.2625793822698694e+02
+   21  5.6300281919954635e+01  7.0478650499360143e+01 -2.1062786831190908e+02
+   22 -1.0025444602684506e+02 -6.8353427900946826e+01  7.4459879083463136e+01
+   23  4.3954164106890424e+01 -2.1252225984133197e+00  1.3616798922844595e+02
+   24 -4.9480288140032329e+01  1.9233281221276744e+02 -1.1450757902121047e+02
+   25 -3.9690277556511717e+01 -1.3297745247110566e+02  2.1747642240220362e+01
+   26  8.9170565696544045e+01 -5.9355359741661772e+01  9.2759936780990117e+01
+   27 -2.6339504856062320e+01  2.1622670107205670e+02 -7.3653991239272059e+01
+   28 -5.7592895215991106e+01 -1.2928512206483205e+02  3.0032824456190355e+00
+   29  8.3932400072053426e+01 -8.6941579007224647e+01  7.0650708793653024e+01
+run_energy: 1174.6225600630123
+run_stress: ! |2-
+   2.7658169122411005e+02 -2.3743377487623573e+02 -3.9147916347874407e+01  2.9007767114801470e+02  1.4053974438881829e+02  2.0434258995590761e+02
 run_forces: ! |2
-    1  8.1036664069391833e+00 -3.9279459516104339e+01 -3.8959949625007155e+01
-    2  4.6488532958171156e+00  3.7987813821226069e+01  3.0712083303318757e+01
-    3 -7.1419656269516480e+01  9.7015207052323333e+01  7.3123837986656483e+01
-    4  3.1774739774255771e+01 -4.4324760214341296e+01 -6.1918121921961003e+01
-    5  5.8630133295649813e+01 -2.5003101567718115e+01  3.8957656941403842e+00
-    6 -8.6686835699933500e+00 -4.2717543793109854e+01 -1.6944132920021204e+01
-    7 -1.5605967450730276e+01  1.3924972058096937e+01 -3.4081311693274161e+00
-    8 -1.6735469954990947e+01 -2.6654949908594496e+01  8.9412902423392993e+00
-    9 -9.4705763934675620e+00  1.3861186924074314e+01 -1.2218212802251793e+00
-   10  1.0864309846473817e+02  1.9311615651482960e+01  1.2534898619395602e+01
-   11 -1.1889594908454491e+01  1.6849924892427488e+00  1.9039966312260486e+00
-   12  9.6643785665770423e+01  1.1329932305772147e+02 -1.3435213826206018e+02
-   13 -2.2815824864999897e+01 -5.9701629573330088e+01  4.1148977584672039e+01
-   14 -4.6226658006998740e+01 -3.0469540424436548e+01  1.5534272011399247e+01
-   15 -5.3141801628038777e+01 -1.8156497866651446e+01  4.6272398149175629e+01
-   16 -7.3254211788300807e+01  1.7569251761827239e+01  7.4522974142679850e+00
-   17  2.0784167932320894e+01 -2.8346879951708846e+01  1.5284477542010659e+01
-   18  1.7456021018344252e+00  1.7528557172698406e+01 -7.0852460721917453e+01
-   19 -2.0389936120749365e+01 -2.5462340563923114e+01  3.0421727677614534e+01
-   20  1.8644334018914940e+01  7.9337833912247095e+00  4.0430733044302912e+01
-   21  1.6517268317097550e+01  2.0531536618559141e+01 -6.1717967915716365e+01
-   22 -2.9293957935776255e+01 -1.9905577364456363e+01  2.1870035659045151e+01
-   23  1.2776689618678706e+01 -6.2595925410277875e-01  3.9847932256671214e+01
-   24 -1.6067082221526842e+01  6.2373469754139357e+01 -3.7096821397423525e+01
-   25 -1.2753486814048248e+01 -4.3101082367336026e+01  7.0662489242667057e+00
-   26  2.8820569035575090e+01 -1.9272387386803331e+01  3.0030572473156820e+01
-   27 -8.9233162938210242e+00  7.2669056612963558e+01 -2.4610439704365813e+01
-   28 -1.9256705992379011e+01 -4.3442840232212284e+01  9.5666525994413210e-01
-   29  2.8180022286200035e+01 -2.9226216380751275e+01  2.3653774444421682e+01
+    1  8.0595702750384035e+00 -3.9275884134753326e+01 -3.8921834417294036e+01
+    2  4.6450877605699539e+00  3.7989319483282912e+01  3.0709930248716290e+01
+    3 -4.4176357886610745e+01 -1.3121510542286003e+00  1.9849684676752698e+01
+    4  1.1432955202502885e+01 -7.3978491141098957e+00 -1.8963452056001909e+01
+    5  2.7565769767176914e+01  1.5533965780817836e+01  1.0064393045239932e+01
+    6  2.2440837721485856e+01 -5.4307979505823312e+01 -6.0734450726614625e+01
+    7 -1.5580688823052480e+01  1.3904189059068386e+01 -3.4017896378595758e+00
+    8 -6.6989876135866879e+01 -2.4455457095150752e+01  1.7071695622632274e+02
+    9  9.4762227087055635e+00 -1.3904425552883753e+01  1.2252549039361496e+00
+   10  1.3329492642527092e+02  1.1514887273699682e+02 -1.0807688660290995e+02
+   11  1.1927511834955308e+01 -1.7182396158290132e+00 -1.8914765821083073e+00
+   12  2.9230443011207992e+01 -9.0747074093425084e+00 -8.5406656692466896e+01
+   13 -4.6010476121847610e+00 -1.2371262892106342e+01  1.4758380429325644e+01
+   14 -2.1309655373546295e+01 -9.6560166053345498e+00  7.4826455796077642e+00
+   15 -8.0586553706859778e+00 -2.8089895416921884e+00  7.1963114045665719e+00
+   16 -1.1814487049351524e+02  2.2070805476502699e+01  4.0103979455896329e+01
+   17  2.0787826988548556e+01 -2.8364190015414366e+01  1.5289010744891176e+01
+   18  5.4411962659043454e+00  5.4597888596162299e+01 -2.2067472725627243e+02
+   19 -6.3374090856904559e+01 -7.9190934240040519e+01  9.4782037192716302e+01
+   20  5.7932894591000213e+01  2.4593045643878220e+01  1.2589269006355613e+02
+   21  5.6478944470524624e+01  7.0203094061683373e+01 -2.1102883364979709e+02
+   22 -9.9996788696603545e+01 -6.7985401318866863e+01  7.4849357252797518e+01
+   23  4.3517844226078921e+01 -2.2176927428165065e+00  1.3617947639699958e+02
+   24 -4.9663522759553963e+01  1.9280842870258854e+02 -1.1467096111871800e+02
+   25 -3.9280982289108742e+01 -1.3314956089589265e+02  2.1920816048609726e+01
+   26  8.8944505048662705e+01 -5.9658867806695888e+01  9.2750145070108275e+01
+   27 -2.6592186096443989e+01  2.1652356998534560e+02 -7.3330722990707770e+01
+   28 -5.7218787679563256e+01 -1.2939713888992102e+02  2.9266537226802889e+00
+   29  8.3810973776007245e+01 -8.7126431095424564e+01  7.0404069268027484e+01
 ...
diff --git a/unittest/force-styles/tests/angle-lepton.yaml b/unittest/force-styles/tests/angle-lepton.yaml
index ea108cfdb1..b4d6c0516f 100644
--- a/unittest/force-styles/tests/angle-lepton.yaml
+++ b/unittest/force-styles/tests/angle-lepton.yaml
@@ -9,7 +9,7 @@ prerequisites: ! |
 pre_commands: ! ""
 post_commands: ! ""
 input_file: in.fourmol
-angle_style: lepton
+angle_style: lepton auto_offset
 angle_coeff: ! |
   1 110.1 "k*theta^2; k=75.0"
   2 111.0 "k*theta^2; k=45.0"
diff --git a/unittest/force-styles/tests/angle-lepton_nooffset.yaml b/unittest/force-styles/tests/angle-lepton_nooffset.yaml
new file mode 100644
index 0000000000..711f0cbdd5
--- /dev/null
+++ b/unittest/force-styles/tests/angle-lepton_nooffset.yaml
@@ -0,0 +1,88 @@
+---
+lammps_version: 22 Dec 2022
+date_generated: Fri Dec 23 15:10:29 2022
+epsilon: 7.5e-13
+skip_tests:
+prerequisites: ! |
+  atom full
+  angle lepton
+pre_commands: ! ""
+post_commands: ! ""
+input_file: in.fourmol
+angle_style: lepton no_offset
+angle_coeff: ! |
+  1 110.1 "k*theta^2; k=75.0"
+  2 111.0 "k*theta^2; k=45.0"
+  3 120.0 "k*theta^2; k=50.0"
+  4 108.5 "k*theta^2; k=100.0"
+equilibrium: 4 1.9216075064457567 1.9373154697137058 2.0943951023931953 1.8936822384138476
+extract: ! |
+  theta0 1
+natoms: 29
+init_energy: 41.53081789649104
+init_stress: ! |2-
+   8.9723357320869297e+01 -8.7188643750026529e+01 -2.5347135708427655e+00  9.2043419883119782e+01 -2.8187238090404904e+01 -1.5291148024926793e+00
+init_forces: ! |2
+    1  4.7865489310693540e+01  7.8760925902181516e+00 -3.2694525514709866e+01
+    2 -1.1124882516177341e+00 -9.0075464203887403e+00 -7.2431691227364459e+00
+    3 -5.9057050592859328e+00  5.3263619873546261e+01  5.2353380124691469e+01
+    4 -1.6032230038990633e+01 -2.4560529343731403e+01  1.2891625920422307e+01
+    5 -4.4802331573497639e+01 -4.8300919461089379e+01 -2.3310767889219324e+01
+    6  4.7083124388174824e+01 -9.5212933434476312e+00 -3.2526392870546800e+01
+    7 -1.6208182775476303e+01  1.4458587960739102e+01 -3.5314745459502710e+00
+    8 -6.5664612141881040e+00 -2.5126850154274202e+01  8.2187944731423329e+01
+    9 -1.5504395262358301e+01  1.6121044185227817e+01 -4.2007069622477866e-01
+   10  9.9863759179365275e+00  4.1873540105704549e+01 -6.6085640966037403e+01
+   11 -2.0441876158908627e+01 -6.5186824168985984e+00  9.0023620309811072e+00
+   12 -1.0772126658369565e+01 -1.0807367300158219e+01 -9.6049647456797871e+00
+   13  2.8847886813946291e+00  7.2973241014859198e+00 -1.0414233993842981e-01
+   14  1.5267407478336393e+01 -9.4754911480231776e+00 -6.6307012925544200e+00
+   15  1.2402914209534773e+01 -6.2644630791613967e+00  1.8484576795819933e+01
+   16  3.8927757686508357e-01  1.0690061587911176e+01  6.1542759189377696e+00
+   17  1.4664194297570785e+00 -1.9971277376602425e+00  1.0776844613215999e+00
+   18  1.5785371874873322e-01  1.6495665212200166e+00 -6.6944747776990434e+00
+   19 -1.9328033033421670e+00 -2.4078805870919706e+00  2.8669575541313534e+00
+   20  1.7749495845934338e+00  7.5831406587195394e-01  3.8275172235676900e+00
+   21  3.4186149299343742e+00  4.2795410364249484e+00 -1.2789555411020650e+01
+   22 -6.0875600315279677e+00 -4.1504951869796605e+00  4.5212856070195766e+00
+   23  2.6689451015935934e+00 -1.2904584944528752e-01  8.2682698040010738e+00
+   24 -1.3053945393770587e+00  5.0741459325183271e+00 -3.0209518576073018e+00
+   25 -1.0471133765834284e+00 -3.5082261409793856e+00  5.7374874908501228e-01
+   26  2.3525079159604871e+00 -1.5659197915389413e+00  2.4472031085222894e+00
+   27 -2.8720725187343754e-01  2.3577465459557132e+00 -8.0312673032168869e-01
+   28 -6.2799575211500369e-01 -1.4097313073755862e+00  3.2747938980616453e-02
+   29  9.1520300398844123e-01 -9.4801523858012704e-01  7.7037879134107223e-01
+run_energy: 41.28323739029462
+run_stress: ! |2-
+   8.8236221596506681e+01 -8.6492260623309562e+01 -1.7439609731970940e+00  9.0601855980531312e+01 -2.8735005690484968e+01 -2.6097632235197477e+00
+run_forces: ! |2
+    1  4.7316793853445830e+01  8.2815577813110188e+00 -3.2021703111755464e+01
+    2 -1.1508196824491330e+00 -9.3814982172707460e+00 -7.5761211707510139e+00
+    3 -5.1083163691832576e+00  5.2667553294971619e+01  5.1784852458007592e+01
+    4 -1.6078177452605999e+01 -2.4156048365236213e+01  1.3140924677013103e+01
+    5 -4.4915734474022280e+01 -4.8095168640411821e+01 -2.3331149037574161e+01
+    6  4.7077916942842350e+01 -9.5906213020090156e+00 -3.2570331503075487e+01
+    7 -1.6228599672412471e+01  1.4485102617342370e+01 -3.5441153194985300e+00
+    8 -6.5097893981550730e+00 -2.5117582302614530e+01  8.2131369512416001e+01
+    9 -1.5527440970965937e+01  1.6147270375910470e+01 -4.0812004993325646e-01
+   10  1.0070812216240984e+01  4.1571532807578805e+01 -6.5968810328796337e+01
+   11 -2.0431584971707451e+01 -6.4817395192247664e+00  8.9879981618991636e+00
+   12 -1.0884695976714678e+01 -1.1067390190389006e+01 -9.1551242768940568e+00
+   13  2.8052913970098801e+00  7.1296301666594912e+00  1.3173039168682621e-02
+   14  1.5254877537873529e+01 -8.9700095533297350e+00 -6.5719846903613162e+00
+   15  1.2392009100170984e+01 -6.0827695435257292e+00  1.7929674392339596e+01
+   16  4.7158712437377481e-01  1.0631038523396533e+01  6.0960085687560355e+00
+   17  1.4458707962589659e+00 -1.9708579331587350e+00  1.0634586790394520e+00
+   18  1.4201882413835909e-01  1.4265339757773337e+00 -5.7663956896747992e+00
+   19 -1.6609130686729365e+00 -2.0735307593211125e+00  2.4755525101127143e+00
+   20  1.5188942445345774e+00  6.4699678354377899e-01  3.2908431795620849e+00
+   21  3.2242729509516406e+00  4.0079233768386153e+00 -1.2047892238650988e+01
+   22 -5.7215184687399772e+00 -3.8871624402883409e+00  4.2679223469272234e+00
+   23  2.4972455177883366e+00 -1.2076093655027398e-01  7.7799698917237645e+00
+   24 -1.1661978296905471e+00  4.5271404898674854e+00 -2.6925565853370195e+00
+   25 -9.2712094527152167e-01 -3.1291890525017125e+00  5.1208215565053827e-01
+   26  2.0933187749620688e+00 -1.3979514373657731e+00  2.1804744296864813e+00
+   27 -2.6804542538020537e-01  2.1830651328698103e+00 -7.3931790038945400e-01
+   28 -5.7927072943128310e-01 -1.3052929090347909e+00  2.8365455885795865e-02
+   29  8.4731615481148848e-01 -8.7777222383501941e-01  7.1095244450365813e-01
+...
diff --git a/unittest/force-styles/tests/bond-lepton_nooffset.yaml b/unittest/force-styles/tests/bond-lepton_nooffset.yaml
new file mode 100644
index 0000000000..b39288640e
--- /dev/null
+++ b/unittest/force-styles/tests/bond-lepton_nooffset.yaml
@@ -0,0 +1,89 @@
+---
+lammps_version: 21 Nov 2023
+date_generated: Thu Jan 18 10:15:41 2024
+epsilon: 2.5e-13
+skip_tests:
+prerequisites: ! |
+  atom full
+  bond lepton
+pre_commands: ! ""
+post_commands: ! ""
+input_file: in.fourmol
+bond_style: lepton no_offset
+bond_coeff: ! |
+  1 1.5 "k*r^2; k=250.0"
+  2 1.1 "k2*r^2 + k3*r^3 + k4*r^4; k2=300.0; k3=-100.0; k4=50.0"
+  3 1.3 "k*r^2; k=350.0"
+  4 1.2 "k*(r-0.2)^2; k=500.0"
+  5 1.0 "k*r^2; k=450.0"
+equilibrium: 5 1.5 1.1 1.3 1.2 1
+extract: ! |
+  r0 1
+natoms: 29
+init_energy: 38.295825321689215
+init_stress: ! |-
+  -4.7778964706834920e+01 -9.3066674567350432e+01  3.4789470658440035e+02 -3.0023920169312170e+01 -8.0421418879842847e+01  5.8592449335969732e+01
+init_forces: ! |2
+    1 -5.9149914305071416e+00 -3.7728809612345245e+01 -2.7769433362963369e+01
+    2 -9.4281609567839944e+00 -7.7586487054273015e+00  1.1096676787527940e+01
+    3  3.2211742366572125e+01  2.7682361264425523e+01 -7.0109911672970497e+00
+    4  4.9260777576375503e+00 -1.3809750102765932e+00  3.4951785613141868e+00
+    5 -1.2606902198593501e+00 -1.9373397933007170e+00  6.4372463095041841e+00
+    6 -3.8858476307965482e+01  6.8567296300319640e+01  1.9889888806614337e+02
+    7  7.5297927100028144e+00 -3.8622600737556944e+01 -1.9268793182212875e+02
+    8  1.3018665172824681e+01 -1.2902789438539877e+01  3.2406676637830003e+00
+    9  7.4343536239661590e-01  8.0072549738604493e-01  3.2899591078538779e+00
+   10  6.1558871886113291e+00 -2.2419470219698296e+00  1.0080175092279852e+01
+   11 -3.7020922615305768e-01 -9.1704102274126453e-01 -1.5046795827370363e+00
+   12  5.2437190958790678e+00  3.4225915524442998e+00 -2.5523597276998897e+00
+   13 -1.1277007635800260e+01  4.4610677459696646e+00  2.1195215396108269e-01
+   14  2.9813926585641828e+00 -6.0667387499775116e-01  7.7317115100728788e+00
+   15  2.5872825164662799e-01 -9.9415365173790704e+00 -3.5428115826174169e+00
+   16  5.2775953236493464e+01 -3.1855535724919463e+01 -1.6524229620195118e+02
+   17 -5.8735858023559175e+01  4.0959855098908882e+01  1.5582804819495431e+02
+   18 -9.0963607969319646e+00 -4.3343406270234155e+00 -1.7623055551859267e+01
+   19  1.2597490501067170e+01  8.0591915019111742e+00  1.5261489294231819e+01
+   20 -3.5011297041352050e+00 -3.7248508748877587e+00  2.3615662576274494e+00
+   21 -1.5332952658285048e+00  5.9630208068632040e-01 -7.4967230017303281e+00
+   22  4.2380253233105529e+00  1.0270453290850614e+00  6.6489894421385651e+00
+   23 -2.7047300574820481e+00 -1.6233474097713818e+00  8.4773355959176278e-01
+   24 -6.6588083188726532e+00  3.5110922792825918e+00 -6.5625174267043489e+00
+   25  7.9844426562464141e+00 -1.2853795683286129e+00  6.7123710742192300e+00
+   26 -1.3256343373737607e+00 -2.2257127109539789e+00 -1.4985364751488087e-01
+   27  6.6999960289138851e+00  6.3808952243186141e+00  2.0100808779497248e+00
+   28 -8.8466157439236681e-01  3.8018717064230995e-01 -5.9857060538593476e-01
+   29 -5.8153344545215182e+00 -6.7610823949609244e+00 -1.4115102725637900e+00
+run_energy: 37.78424389351509
+run_stress: ! |-
+  -4.6127506998693484e+01 -9.2129732247211749e+01  3.4548310342284810e+02 -2.9841348469661163e+01 -7.8434962689387717e+01  5.9253167412123155e+01
+run_forces: ! |2
+    1 -5.8451208652159004e+00 -3.7483084455000643e+01 -2.7706576989352534e+01
+    2 -9.4646964278974774e+00 -7.8058897724822449e+00  1.1098831256058579e+01
+    3  3.1827086102630346e+01  2.7573911030624821e+01 -6.9576662575837211e+00
+    4  5.1502169659901655e+00 -1.4367546726785101e+00  3.6631301025186187e+00
+    5 -1.2208420775139264e+00 -1.8781699435112362e+00  6.2332639085051911e+00
+    6 -3.8491523409043303e+01  6.8063273218541468e+01  1.9723141045830272e+02
+    7  7.4838209349394775e+00 -3.8394258853636330e+01 -1.9092625515909930e+02
+    8  1.2676329319901857e+01 -1.2475162287097550e+01  3.3659783337736577e+00
+    9  6.8845241565874460e-01  7.3814593866184031e-01  3.0434095400342533e+00
+   10  6.2545583994797553e+00 -2.9600470917047201e+00  9.4247125735981765e+00
+   11 -1.9554747834212524e-01 -4.8434314068172696e-01 -7.9452259566032057e-01
+   12  5.2092795750960841e+00  3.1431929551776721e+00 -3.1346654851373348e+00
+   13 -1.1496483840617872e+01  4.5245217971580018e+00  2.1348220240918236e-01
+   14  3.1913399826660909e+00 -6.3760720126489068e-01  8.2740980433927742e+00
+   15  2.7338564489784484e-01 -9.7206665011069671e+00 -3.4841809697094543e+00
+   16  5.2461611410918316e+01 -3.1639255494702798e+01 -1.6483607587596811e+02
+   17 -5.8501866653548078e+01  4.0872194473703807e+01  1.5529162691391761e+02
+   18 -7.0990354207248405e+00 -2.4743922643289666e+00 -1.7824398936159682e+01
+   19  1.2019842510974870e+01  7.7105128268768715e+00  1.4523712108141252e+01
+   20 -4.9208070902500296e+00 -5.2361205625479048e+00  3.3006868280184283e+00
+   21 -1.8548628650934149e+00  2.7467524264262122e-01 -6.7601469408617412e+00
+   22  3.9136757840663186e+00  9.5561415744904055e-01  6.1181929861632272e+00
+   23 -2.0588129189729036e+00 -1.2302894000916618e+00  6.4195395469851357e-01
+   24 -5.7681973234153086e+00  2.0209144998436366e+00 -5.2864044021513967e+00
+   25  6.3696975292216704e+00 -1.0109756418053095e+00  5.3564043759405795e+00
+   26 -6.0150020580636188e-01 -1.0099388580383271e+00 -6.9999973789182365e-02
+   27  6.8467535469188450e+00  5.7500299184200578e+00  2.2775780974490298e+00
+   28 -1.3929430925479587e+00  5.9772788540443345e-01 -9.4056106886485980e-01
+   29 -5.4538104543708865e+00 -6.3477578038244911e+00 -1.3370170285841700e+00
+...
diff --git a/unittest/force-styles/tests/mol-pair-lepton.yaml b/unittest/force-styles/tests/mol-pair-lepton.yaml
index 03117a9aa5..33576e81c2 100644
--- a/unittest/force-styles/tests/mol-pair-lepton.yaml
+++ b/unittest/force-styles/tests/mol-pair-lepton.yaml
@@ -1,6 +1,6 @@
 ---
-lammps_version: 22 Dec 2022
-date_generated: Thu Dec 22 09:57:30 2022
+lammps_version: 21 Nov 2023
+date_generated: Thu Jan 18 11:01:50 2024
 epsilon: 5e-14
 skip_tests: intel
 prerequisites: ! |
@@ -23,23 +23,24 @@ pair_coeff: ! |
   2 4    "4.0*eps*((sig/r)^12-(sig/r)^6);eps=0.005;sig=0.5"
   2 5    "4.0*eps*((sig/r)^12-(sig/r)^6);eps=0.00866025;sig=2.05"
   3 3    "4.0*eps*((sig/r)^12-(sig/r)^6);eps=0.02;sig=3.2"
-  3 4    "4.0*eps*((sig/r)^12-(sig/r)^6);eps=0.0173205;sig=3.15"
+  3 4    "-eps*r;eps=0.0173205;sig=3.15"
   3 5    "4.0*eps*((sig/r)^12-(sig/r)^6);eps=0.0173205;sig=3.15"
+  4 4    "10.0"
 extract: ! ""
 natoms: 29
-init_vdwl: 749.2468149791969
+init_vdwl: 746.1575578155301
 init_coul: 0
 init_stress: ! |2-
-   2.1793853434038242e+03  2.1988955172192768e+03  4.6653977523326257e+03 -7.5956547636050584e+02  2.4751536734032861e+01  6.6652028436400667e+02
+   2.1723526811665593e+03  2.1959162890293533e+03  4.6328064825512138e+03 -7.5509180369489252e+02  9.4506578600439983e+00  6.7585028859193505e+02
 init_forces: ! |2
-    1 -2.3333390280895912e+01  2.6994567613322641e+02  3.3272827850356805e+02
+    1 -2.3359983837422618e+01  2.6996030011590727e+02  3.3274783233743295e+02
     2  1.5828554630414899e+02  1.3025008843535872e+02 -1.8629682358935722e+02
     3 -1.3528903738169066e+02 -3.8704313358319990e+02 -1.4568978437133106e+02
     4 -7.8711096705893366e+00  2.1350518625373538e+00 -5.5954532185548134e+00
     5 -2.5176757268228540e+00 -4.0521510681020239e+00  1.2152704057877019e+01
     6 -8.3190662465252137e+02  9.6394149462625603e+02  1.1509093566509248e+03
-    7  5.8203388932513583e+01 -3.3608997951626793e+02 -1.7179617996573040e+03
-    8  1.4451392284291535e+02 -1.0927475861088995e+02  3.9990593492420442e+02
+    7  6.6340523101244187e+01 -3.4078810185436379e+02 -1.7003039516942540e+03
+    8  1.3674478037618434e+02 -1.0517874373121482e+02  3.8291074246191346e+02
     9  7.9156945283097443e+01  8.5273009783986538e+01  3.5032175698445189e+02
    10  5.3118875219105360e+02 -6.1040990859419412e+02 -1.8355872642619292e+02
    11 -2.3530157267965532e+00 -5.9077640073819717e+00 -9.6590723955414290e+00
@@ -48,8 +49,8 @@ init_forces: ! |2
    14 -3.3852721292265153e+00  6.8636181241903649e-01 -8.7507190862499868e+00
    15 -2.0454999188605300e-01  8.4846165523049883e+00  3.0131615419406712e+00
    16  4.6326310311812108e+02 -3.3087715736498188e+02 -1.1893024561782554e+03
-   17 -4.5334300923766727e+02  3.1554283255882569e+02  1.2058417793481203e+03
-   18 -1.8862623280672661e-02 -3.3402010907951661e-02  3.1000479299095260e-02
+   17 -4.5371128972368928e+02  3.1609940794953951e+02  1.2052011419527653e+03
+   18  8.0197172683943874e-03 -2.4939258820032362e-03 -1.0571459969936936e-02
    19  3.1843079640570047e-04 -2.3918627818763426e-04  1.7427252638513439e-03
    20 -9.9760831209706009e-04 -1.0209184826753090e-03  3.6910972636601454e-04
    21 -7.1566125273265186e+01 -8.1615678329920655e+01  2.2589561408339890e+02
@@ -61,38 +62,38 @@ init_forces: ! |2
    27  5.1810388677546001e+01 -2.2705458321213797e+02  9.0849111082069669e+01
    28 -1.8041307121444069e+02  7.7534042932772905e+01 -1.2206956760706598e+02
    29  1.2861057254925012e+02  1.4952711274394568e+02  3.1216025556267880e+01
-run_vdwl: 719.4530651193046
+run_vdwl: 716.5213000416621
 run_coul: 0
 run_stress: ! |2-
-   2.1330153957371017e+03  2.1547728168285516e+03  4.3976497417710125e+03 -7.3873328448298525e+02  4.1743821105370067e+01  6.2788012209191027e+02
+   2.1263870112744726e+03  2.1520080341389726e+03  4.3663519512361027e+03 -7.3456213833770062e+02  2.6927285459244832e+01  6.3691834104928068e+02
 run_forces: ! |2
-    1 -2.0299419751359164e+01  2.6686193378823020e+02  3.2358785870694015e+02
-    2  1.5298617928491225e+02  1.2596516341409203e+02 -1.7961292655338619e+02
-    3 -1.3353630652439830e+02 -3.7923748696131315e+02 -1.4291839793625817e+02
-    4 -7.8374717836161762e+00  2.1276610789823409e+00 -5.5845014473820616e+00
-    5 -2.5014258630866735e+00 -4.0250131424704412e+00  1.2103512372025639e+01
-    6 -8.0681462887292457e+02  9.2165637136761688e+02  1.0270795806932783e+03
-    7  5.5780279349903523e+01 -3.1117530951561656e+02 -1.5746991292869018e+03
-    8  1.3452983055535049e+02 -1.0064659350255911e+02  3.8851791558207651e+02
-    9  7.6746213883425980e+01  8.2501469877402130e+01  3.3944351200617882e+02
-   10  5.2128033527695595e+02 -5.9920098848285863e+02 -1.8126029815043339e+02
-   11 -2.3573118090915246e+00 -5.8616944550888359e+00 -9.6049808811326205e+00
-   12  1.7503975847822900e+01  1.0626930310560814e+01 -8.0603160272054968e+00
-   13  8.0530313322973104e+00 -3.1756495170399117e+00 -1.4618315664740528e-01
-   14 -3.3416065168069773e+00  6.6492606336082150e-01 -8.6345131440469700e+00
-   15 -2.2253843262374914e-01  8.5025661635348779e+00  3.0369735873081622e+00
-   16  4.3476311264989465e+02 -3.1171086735551415e+02 -1.1135217194927448e+03
-   17 -4.2469846140777133e+02  2.9615411776780593e+02  1.1302573488400669e+03
-   18 -1.8849981672825908e-02 -3.3371636477421307e-02  3.0986293443778727e-02
-   19  3.0940277774414027e-04 -2.4634536455373044e-04  1.7433360008861016e-03
-   20 -9.8648131277150790e-04 -1.0112587134526946e-03  3.6932948773965417e-04
-   21 -7.0490745283106378e+01 -7.9749153581142139e+01  2.2171003384646431e+02
-   22 -1.0638717908920071e+02 -2.5949502163177968e+01 -1.6645589526812276e+02
-   23  1.7686797710735027e+02  1.0571018898885514e+02 -5.5243337084099387e+01
-   24  3.8206017656281375e+01 -2.1022820141992960e+02  1.1260711266189014e+02
-   25 -1.4918881473530880e+02  2.3762151395876508e+01 -1.2549188139143085e+02
-   26  1.1097059498808308e+02  1.8645503634228518e+02  1.2861559677865248e+01
-   27  5.0800844984832125e+01 -2.2296588090685469e+02  8.8607367716323253e+01
-   28 -1.7694190504288886e+02  7.6029945485182026e+01 -1.1950518150242071e+02
-   29  1.2614894925528141e+02  1.4694250820033548e+02  3.0893386672863034e+01
+    1 -2.0326040164905073e+01  2.6687684422507328e+02  3.2360752654223910e+02
+    2  1.5298608857690186e+02  1.2596506573447739e+02 -1.7961281277841888e+02
+    3 -1.3353631293077220e+02 -3.7923732277833739e+02 -1.4291833260989750e+02
+    4 -7.8374717116975035e+00  2.1276610267113969e+00 -5.5845014524498486e+00
+    5 -2.5014258756924157e+00 -4.0250131713717776e+00  1.2103512280982228e+01
+    6 -8.0714971444536457e+02  9.2203068890526424e+02  1.0274502514782534e+03
+    7  6.3722543724608350e+01 -3.1586173092061807e+02 -1.5580743968587681e+03
+    8  1.2737293861904031e+02 -9.6945064279519002e+01  3.7231518354375891e+02
+    9  7.6709940036396304e+01  8.2451980339096536e+01  3.3926849385746954e+02
+   10  5.2123408713149831e+02 -5.9914309504622599e+02 -1.8121478407355445e+02
+   11 -2.3573086824741427e+00 -5.8616969504300931e+00 -9.6049799947287671e+00
+   12  1.7504108236707797e+01  1.0626901299509713e+01 -8.0602444903747301e+00
+   13  8.0530313558451159e+00 -3.1756495145404533e+00 -1.4618321144421534e-01
+   14 -3.3416062225209915e+00  6.6492609500227240e-01 -8.6345136470911594e+00
+   15 -2.2253820242887132e-01  8.5025660110994483e+00  3.0369741645942137e+00
+   16  4.3476708820318731e+02 -3.1171425443331651e+02 -1.1135289618967258e+03
+   17 -4.2507048343681140e+02  2.9671384825884064e+02  1.1296230654445915e+03
+   18  8.0130752607770750e-03 -2.4895867517657545e-03 -1.0574351684568857e-02
+   19  3.0939970262803125e-04 -2.4635874092791046e-04  1.7433490521479268e-03
+   20 -9.8648319666298735e-04 -1.0112621691758337e-03  3.6933139856766442e-04
+   21 -7.0490745298133859e+01 -7.9749153568373742e+01  2.2171003384665224e+02
+   22 -1.0638717908973166e+02 -2.5949502162671845e+01 -1.6645589526807785e+02
+   23  1.7686797710711278e+02  1.0571018898899243e+02 -5.5243337084327727e+01
+   24  3.8206017659583978e+01 -2.1022820135505594e+02  1.1260711269986750e+02
+   25 -1.4918881473631544e+02  2.3762151403215309e+01 -1.2549188138812220e+02
+   26  1.1097059498835199e+02  1.8645503634383900e+02  1.2861559678659969e+01
+   27  5.0800844960383969e+01 -2.2296588092255456e+02  8.8607367714616288e+01
+   28 -1.7694190504410764e+02  7.6029945484553380e+01 -1.1950518150262033e+02
+   29  1.2614894924957088e+02  1.4694250819500266e+02  3.0893386676150566e+01
 ...
diff --git a/unittest/formats/test_molecule_file.cpp b/unittest/formats/test_molecule_file.cpp
index 8fe1fc1eab..c798d2f4c2 100644
--- a/unittest/formats/test_molecule_file.cpp
+++ b/unittest/formats/test_molecule_file.cpp
@@ -32,6 +32,8 @@ using testing::StrEq;
 
 using utils::split_words;
 
+const double EPSILON = 5.0e-14;
+
 #define test_name test_info_->name()
 
 static void create_molecule_files(const std::string &h2o_filename, const std::string &co2_filename)
@@ -145,7 +147,7 @@ protected:
         fclose(fp);
 
         command(fmt::format("molecule {} {} {}", name, file, args));
-        remove(file.c_str());
+        platform::unlink(file.c_str());
     }
 };
 
@@ -184,7 +186,7 @@ TEST_F(MoleculeFileTest, badargs)
     TEST_FAILURE(
         ".*Illegal molecule command.*",
         run_mol_cmd(test_name, "scale", "Comment\n1 atoms\n\n Coords\n\n 1 0.0 0.0 0.0\n"););
-    remove("badargs.mol");
+    platform::unlink("moltest_badargs.mol");
 }
 
 TEST_F(MoleculeFileTest, noatom)
@@ -193,14 +195,14 @@ TEST_F(MoleculeFileTest, noatom)
                  run_mol_cmd(test_name, "",
                              "Comment\n0 atoms\n1 bonds\n\n"
                              " Coords\n\nBonds\n\n 1 1 2\n"););
-    remove("noatom.mol");
+    platform::unlink("moltest_noatom.mol");
 }
 
 TEST_F(MoleculeFileTest, empty)
 {
     TEST_FAILURE(".*ERROR: Unexpected end of molecule file.*",
                  run_mol_cmd(test_name, "", "Comment\n\n"););
-    remove("empty.mol");
+    platform::unlink("moltest_empty.mol");
 }
 
 TEST_F(MoleculeFileTest, nospecial)
@@ -210,7 +212,7 @@ TEST_F(MoleculeFileTest, nospecial)
                              "Comment\n3 atoms\n\n2 bonds\n\n"
                              " Coords\n\n 1 1.0 1.0 1.0\n 2 1.0 1.0 0.0\n 3 1.0 0.0 1.0\n"
                              " Bonds\n\n 1 1 1 2\n 2 1 1 3\n"););
-    remove("nospecial.mol");
+    platform::unlink("moltest_nospecial.mol");
 }
 
 TEST_F(MoleculeFileTest, minimal)
@@ -218,7 +220,7 @@ TEST_F(MoleculeFileTest, minimal)
     BEGIN_CAPTURE_OUTPUT();
     run_mol_cmd(test_name, "", "Comment\n1 atoms\n\n Coords\n\n 1 0.0 0.0 0.0\n");
     auto output = END_CAPTURE_OUTPUT();
-    ASSERT_THAT(output, ContainsRegex(".*Read molecule template.*\n.*1 molecules.*\n"
+    ASSERT_THAT(output, ContainsRegex(".*Read molecule template.*\n.*Comment.*\n.*1 molecules.*\n"
                                       ".*0 fragments.*\n.*1 atoms.*\n.*0 bonds.*"));
 }
 
@@ -230,7 +232,7 @@ TEST_F(MoleculeFileTest, notype)
     command("create_box 1 box");
     run_mol_cmd(test_name, "", "Comment\n1 atoms\n\n Coords\n\n 1 0.0 0.0 0.0\n");
     auto output = END_CAPTURE_OUTPUT();
-    ASSERT_THAT(output, ContainsRegex(".*Read molecule template.*\n.*1 molecules.*\n"
+    ASSERT_THAT(output, ContainsRegex(".*Read molecule template.*\n.*Comment.*\n.*1 molecules.*\n"
                                       ".*0 fragments.*\n.*1 atoms.*\n.*0 bonds.*"));
     TEST_FAILURE(".*ERROR: Create_atoms molecule must have atom types.*",
                  command("create_atoms 0 single 0.0 0.0 0.0 mol notype 542465"););
@@ -259,7 +261,7 @@ TEST_F(MoleculeFileTest, twomols)
                 " Coords\n\n 1 0.0 0.0 0.0\n 2 0.0 0.0 1.0\n"
                 " Molecules\n\n 1 1\n 2 2\n\n Types\n\n 1 1\n 2 2\n\n");
     auto output = END_CAPTURE_OUTPUT();
-    ASSERT_THAT(output, ContainsRegex(".*Read molecule template.*\n.*2 molecules.*\n"
+    ASSERT_THAT(output, ContainsRegex(".*Read molecule template.*\n.*Comment.*\n.*2 molecules.*\n"
                                       ".*0 fragments.*\n.*2 atoms with max type 2.*\n.*0 bonds.*"));
     ASSERT_EQ(lmp->atom->nmolecule, 1);
     auto mols = lmp->atom->get_molecule_by_id(test_name);
@@ -273,10 +275,10 @@ TEST_F(MoleculeFileTest, twofiles)
     auto output = END_CAPTURE_OUTPUT();
     ASSERT_THAT(
         output,
-        ContainsRegex(".*Read molecule template twomols:.*\n.*1 molecules.*\n"
+        ContainsRegex(".*Read molecule template twomols:.*\n.*Water.*\n.*1 molecules.*\n"
                       ".*0 fragments.*\n.*3 atoms with max type 2.*\n.*2 bonds with max type 1.*\n"
                       ".*1 angles with max type 1.*\n.*0 dihedrals.*\n.*0 impropers.*\n"
-                      ".*Read molecule template twomols:.*\n.*1 molecules.*\n"
+                      ".*Read molecule template twomols:.*\n.*CO2.*\n.*1 molecules.*\n"
                       ".*0 fragments.*\n.*3 atoms with max type 4.*\n.*2 bonds with max type 2.*\n"
                       ".*1 angles with max type 2.*\n.*0 dihedrals.*"));
     BEGIN_CAPTURE_OUTPUT();
@@ -306,7 +308,7 @@ TEST_F(MoleculeFileTest, labelmap)
     auto output = END_CAPTURE_OUTPUT();
     ASSERT_THAT(
         output,
-        ContainsRegex(".*Read molecule template h2olabel:.*\n.*1 molecules.*\n"
+        ContainsRegex(".*Read molecule template h2olabel:.*\n.*Water.*\n.*1 molecules.*\n"
                       ".*0 fragments.*\n.*3 atoms with max type 2.*\n.*2 bonds with max type 1.*\n"
                       ".*1 angles with max type 1.*\n.*0 dihedrals.*\n.*0 impropers.*"));
     BEGIN_CAPTURE_OUTPUT();
@@ -314,7 +316,7 @@ TEST_F(MoleculeFileTest, labelmap)
     output = END_CAPTURE_OUTPUT();
     ASSERT_THAT(
         output,
-        ContainsRegex(".*Read molecule template co2label:.*\n.*1 molecules.*\n"
+        ContainsRegex(".*Read molecule template co2label:.*\n.*CO2.*\n.*1 molecules.*\n"
                       ".*0 fragments.*\n.*3 atoms with max type 4.*\n.*2 bonds with max type 2.*\n"
                       ".*1 angles with max type 2.*\n.*0 dihedrals.*"));
     BEGIN_CAPTURE_OUTPUT();
@@ -328,12 +330,12 @@ TEST_F(MoleculeFileTest, labelmap)
     auto second = output.substr(mark);
     ASSERT_THAT(
         first,
-        ContainsRegex(".*Read molecule template h2onum:.*\n.*1 molecules.*\n"
+        ContainsRegex(".*Read molecule template h2onum:.*\n.*Water.*\n.*1 molecules.*\n"
                       ".*0 fragments.*\n.*3 atoms with max type 2.*\n.*2 bonds with max type 1.*\n"
                       ".*1 angles with max type 1.*\n.*0 dihedrals.*\n.*0 impropers.*\n"));
     ASSERT_THAT(
         second,
-        ContainsRegex(".*Read molecule template co2num:.*\n.*1 molecules.*\n"
+        ContainsRegex(".*Read molecule template co2num:.*\n.*CO2.*\n.*1 molecules.*\n"
                       ".*0 fragments.*\n.*3 atoms with max type 4.*\n.*2 bonds with max type 2.*\n"
                       ".*1 angles with max type 2.*\n.*0 dihedrals.*"));
     ASSERT_EQ(lmp->atom->nmolecule, 4);
@@ -379,7 +381,7 @@ TEST_F(MoleculeFileTest, bonds)
                 " 1 1 1 2\n"
                 " 2 2 1 3\n\n");
     auto output = END_CAPTURE_OUTPUT();
-    ASSERT_THAT(output, ContainsRegex(".*Read molecule template.*\n.*1 molecules.*\n"
+    ASSERT_THAT(output, ContainsRegex(".*Read molecule template.*\n.*Comment.*\n.*1 molecules.*\n"
                                       ".*0 fragments.*\n.*4 atoms.*type.*2.*\n"
                                       ".*2 bonds.*type.*2.*\n.*0 angles.*"));
 
@@ -404,6 +406,60 @@ TEST_F(MoleculeFileTest, bonds)
     END_HIDE_OUTPUT();
 }
 
+TEST_F(MoleculeFileTest, dipoles)
+{
+    if (!LAMMPS::is_installed_pkg("DIPOLE")) GTEST_SKIP(); // test uses atom_style dipole
+    BEGIN_CAPTURE_OUTPUT();
+    command("atom_style dipole");
+    command("region box block 0 1 0 1 0 1");
+    command("create_box 2 box");
+    run_mol_cmd(test_name, "",
+                "# Dumbbell with dipole molecule file.\n\n"
+                "2 atoms\n\n"
+                "Coords\n\n1 -1.0 0.0 0.0\n2  1.0 0.0 0.0\n\n"
+                "Types\n\n1 1\n2 2\n\n"
+                "Dipoles\n\n1 1.0 0.0 0.0\n2 1.0 1.0 0.0\n\n");
+    auto output = END_CAPTURE_OUTPUT();
+    ASSERT_THAT(output, ContainsRegex(".*Read molecule template.*\n.*Dumbbell.*\n.*1 molecules.*\n"
+                                      ".*0 fragments.*\n.*2 atoms.*type.*2.*\n"));
+
+    BEGIN_CAPTURE_OUTPUT();
+    command("mass * 1.0");
+    command("create_atoms 0 single 0.5 0.5 0.5 mol dipoles 67235 rotate 90.0 0.0 0.0 1.0");
+    output = END_CAPTURE_OUTPUT();
+    ASSERT_THAT(output, ContainsRegex(".*Created 2 atoms.*"));
+
+    Molecule *mol = lmp->atom->molecules[0]; // presumably the template read above
+    ASSERT_EQ(mol->natoms, 2);
+    ASSERT_EQ(lmp->atom->natoms, 2);
+    mol->compute_mass();
+    mol->compute_com();
+    EXPECT_NEAR(mol->masstotal, 2.0, EPSILON);
+    EXPECT_NEAR(mol->com[0], 0.0, EPSILON);
+    EXPECT_NEAR(mol->com[1], 0.0, EPSILON);
+    EXPECT_NEAR(mol->com[2], 0.0, EPSILON);
+    EXPECT_EQ(mol->comatom, 1);
+    ASSERT_NE(mol->mu, nullptr); // template dipoles: expected unrotated, as in "Dipoles" above
+    EXPECT_NEAR(mol->mu[0][0], 1.0, EPSILON);
+    EXPECT_NEAR(mol->mu[0][1], 0.0, EPSILON);
+    EXPECT_NEAR(mol->mu[0][2], 0.0, EPSILON);
+    EXPECT_NEAR(mol->mu[1][0], 1.0, EPSILON);
+    EXPECT_NEAR(mol->mu[1][1], 1.0, EPSILON);
+    EXPECT_NEAR(mol->mu[1][2], 0.0, EPSILON);
+    EXPECT_NEAR(mol->maxextent, 2.0, EPSILON);
+    // per-atom dipoles: rotated 90 degrees about z, (x,y,z) -> (-y,x,z); mu[i][3] holds the norm
+    double **mu = lmp->atom->mu;
+    ASSERT_NE(mu, nullptr);
+    EXPECT_NEAR(mu[0][0], 0.0, EPSILON);
+    EXPECT_NEAR(mu[0][1], 1.0, EPSILON);
+    EXPECT_NEAR(mu[0][2], 0.0, EPSILON);
+    EXPECT_NEAR(mu[0][3], 1.0, EPSILON);
+    EXPECT_NEAR(mu[1][0], -1.0, EPSILON);
+    EXPECT_NEAR(mu[1][1], 1.0, EPSILON);
+    EXPECT_NEAR(mu[1][2], 0.0, EPSILON);
+    EXPECT_NEAR(mu[1][3], sqrt(2.0), EPSILON);
+}
+
 int main(int argc, char **argv)
 {
     MPI_Init(&argc, &argv);
diff --git a/unittest/utils/test_lepton.cpp b/unittest/utils/test_lepton.cpp
index a9fa6e3543..55d3bf8351 100644
--- a/unittest/utils/test_lepton.cpp
+++ b/unittest/utils/test_lepton.cpp
@@ -542,6 +542,41 @@ TEST(Lepton, Optimize)
     out.str("");
 }
 
+TEST(Lepton, Exception)
+{
+    Lepton::CompiledExpression function, derivative;
+
+    auto parsed = Lepton::Parser::parse("x*x"); // case 1: f and df/dx both contain x
+    function    = parsed.createCompiledExpression();
+    derivative  = parsed.differentiate("x").createCompiledExpression();
+
+    double x = 1.5;
+    EXPECT_NO_THROW(function.getVariableReference("x") = x;);
+    EXPECT_NO_THROW(derivative.getVariableReference("x") = x;);
+    EXPECT_DOUBLE_EQ(function.evaluate(), 2.25);
+    EXPECT_DOUBLE_EQ(derivative.evaluate(), 3.0);
+
+    parsed     = Lepton::Parser::parse("x"); // case 2: df/dx is the constant 1, no x left in it
+    function   = parsed.createCompiledExpression();
+    derivative = parsed.differentiate("x").createCompiledExpression();
+
+    x = 2.5;
+    EXPECT_NO_THROW(function.getVariableReference("x") = x;);
+    EXPECT_THROW(derivative.getVariableReference("x") = x;, Lepton::Exception);
+    EXPECT_DOUBLE_EQ(function.evaluate(), 2.5);
+    EXPECT_DOUBLE_EQ(derivative.evaluate(), 1.0);
+
+    parsed     = Lepton::Parser::parse("1.0"); // case 3: neither f nor df/dx contains x
+    function   = parsed.createCompiledExpression();
+    derivative = parsed.differentiate("x").createCompiledExpression();
+
+    x = 0.5;
+    EXPECT_THROW(function.getVariableReference("x") = x;, Lepton::Exception);
+    EXPECT_THROW(derivative.getVariableReference("x") = x;, Lepton::Exception);
+    EXPECT_DOUBLE_EQ(function.evaluate(), 1.0); // evaluation still works without the variable
+    EXPECT_DOUBLE_EQ(derivative.evaluate(), 0.0);
+}
+
 int main(int argc, char **argv)
 {
     MPI_Init(&argc, &argv);