diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake
index aaa784ca8b..a57715d294 100644
--- a/cmake/Modules/Packages/GPU.cmake
+++ b/cmake/Modules/Packages/GPU.cmake
@@ -217,13 +217,20 @@ elseif(GPU_API STREQUAL "OPENCL")
 elseif(GPU_API STREQUAL "HIP")
   if(NOT DEFINED HIP_PATH)
       if(NOT DEFINED ENV{HIP_PATH})
-          set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
+          set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to HIP installation")
       else()
-          set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
+          set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to HIP installation")
       endif()
   endif()
-  set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
-  find_package(HIP REQUIRED)
+  if(NOT DEFINED ROCM_PATH)
+      if(NOT DEFINED ENV{ROCM_PATH})
+          set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation")
+      else()
+          set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation")
+      endif()
+  endif()
+  list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH})
+  find_package(hip REQUIRED)
   option(HIP_USE_DEVICE_SORT "Use GPU sorting" ON)
 
   if(NOT DEFINED HIP_PLATFORM)
@@ -325,10 +332,11 @@ elseif(GPU_API STREQUAL "HIP")
 
   set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h ${LAMMPS_LIB_BINARY_DIR}/gpu/*.cu.cpp")
 
-  hip_add_library(gpu STATIC ${GPU_LIB_SOURCES})
+  add_library(gpu STATIC ${GPU_LIB_SOURCES})
   target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu)
   target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
   target_compile_definitions(gpu PRIVATE -DUSE_HIP)
+  target_link_libraries(gpu PRIVATE hip::host)
 
   if(HIP_USE_DEVICE_SORT)
     # add hipCUB
@@ -377,8 +385,9 @@ elseif(GPU_API STREQUAL "HIP")
     endif()
   endif()
 
-  hip_add_executable(hip_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
+  add_executable(hip_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
   target_compile_definitions(hip_get_devices PRIVATE -DUCL_HIP)
+  target_link_libraries(hip_get_devices PRIVATE hip::host)
 
   if(HIP_PLATFORM STREQUAL "nvcc")
     target_compile_definitions(gpu PRIVATE -D__HIP_PLATFORM_NVCC__)
diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake
index a1cf680266..d5fccad4ba 100644
--- a/cmake/Modules/Packages/KOKKOS.cmake
+++ b/cmake/Modules/Packages/KOKKOS.cmake
@@ -1,6 +1,8 @@
 ########################################################################
 # As of version 3.3.0 Kokkos requires C++14
-set(CMAKE_CXX_STANDARD 14)
+if((NOT CMAKE_CXX_STANDARD) OR (CMAKE_CXX_STANDARD LESS 14) OR (CMAKE_CXX_STANDARD EQUAL 98))  # unset and 98 are below C++14, too
+  set(CMAKE_CXX_STANDARD 14)
+endif()
 ########################################################################
 # consistency checks and Kokkos options/settings required by LAMMPS
 if(Kokkos_ENABLE_CUDA)
diff --git a/cmake/Modules/Packages/LATTE.cmake b/cmake/Modules/Packages/LATTE.cmake
index ddf31a68ed..a96e850f7e 100644
--- a/cmake/Modules/Packages/LATTE.cmake
+++ b/cmake/Modules/Packages/LATTE.cmake
@@ -19,6 +19,14 @@ if(DOWNLOAD_LATTE)
   set(LATTE_MD5 "820e73a457ced178c08c71389a385de7" CACHE STRING "MD5 checksum of LATTE tarball")
   mark_as_advanced(LATTE_URL)
   mark_as_advanced(LATTE_MD5)
+
+  # CMake cannot pass the BLAS or LAPACK library variables to an external project if they are lists (more than one entry)
+  list(LENGTH BLAS_LIBRARIES NUM_BLAS)
+  list(LENGTH LAPACK_LIBRARIES NUM_LAPACK)
+  if((NUM_BLAS GREATER 1) OR (NUM_LAPACK GREATER 1))
+    message(FATAL_ERROR "Cannot compile downloaded LATTE library due to a technical limitation")
+  endif()
+
   include(ExternalProject)
   ExternalProject_Add(latte_build
     URL     ${LATTE_URL}
diff --git a/cmake/Modules/Packages/ML-HDNNP.cmake b/cmake/Modules/Packages/ML-HDNNP.cmake
index 44873b9929..e27b3a1410 100644
--- a/cmake/Modules/Packages/ML-HDNNP.cmake
+++ b/cmake/Modules/Packages/ML-HDNNP.cmake
@@ -45,12 +45,12 @@ if(DOWNLOAD_N2P2)
     # get path to MPI include directory when cross-compiling to windows
     if((CMAKE_SYSTEM_NAME STREQUAL Windows) AND CMAKE_CROSSCOMPILING)
       get_target_property(N2P2_MPI_INCLUDE MPI::MPI_CXX INTERFACE_INCLUDE_DIRECTORIES)
-      set(N2P2_PROJECT_OPTIONS "-I ${N2P2_MPI_INCLUDE} -DMPICH_SKIP_MPICXX=1")
+      set(N2P2_PROJECT_OPTIONS "-I${N2P2_MPI_INCLUDE}")
       set(MPI_CXX_COMPILER ${CMAKE_CXX_COMPILER})
     endif()
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
       get_target_property(N2P2_MPI_INCLUDE MPI::MPI_CXX INTERFACE_INCLUDE_DIRECTORIES)
-      set(N2P2_PROJECT_OPTIONS "-I ${N2P2_MPI_INCLUDE} -DMPICH_SKIP_MPICXX=1")
+      set(N2P2_PROJECT_OPTIONS "-I${N2P2_MPI_INCLUDE}")
       set(MPI_CXX_COMPILER ${CMAKE_CXX_COMPILER})
     endif()
   endif()
@@ -69,6 +69,12 @@ if(DOWNLOAD_N2P2)
   # echo final flag for debugging
   message(STATUS "N2P2 BUILD OPTIONS: ${N2P2_BUILD_OPTIONS}")
 
+  # must have "sed" command to compile n2p2 library (for now)
+  find_program(HAVE_SED sed)
+  if(NOT HAVE_SED)
+    message(FATAL_ERROR "Must have 'sed' program installed to compile 'n2p2' library for ML-HDNNP package")
+  endif()
+
   # download compile n2p2 library. much patch MPI calls in LAMMPS interface to accommodate MPI-2 (e.g. for cross-compiling)
   include(ExternalProject)
   ExternalProject_Add(n2p2_build
diff --git a/cmake/Modules/Packages/ML-QUIP.cmake b/cmake/Modules/Packages/ML-QUIP.cmake
index 5a80e63d55..92418e8939 100644
--- a/cmake/Modules/Packages/ML-QUIP.cmake
+++ b/cmake/Modules/Packages/ML-QUIP.cmake
@@ -50,7 +50,7 @@ if(DOWNLOAD_QUIP)
     GIT_TAG origin/public
     GIT_SHALLOW YES
     GIT_PROGRESS YES
-    PATCH_COMMAND cp ${CMAKE_BINARY_DIR}/quip.config <SOURCE_DIR>/arch/Makefile.lammps
+    PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_BINARY_DIR}/quip.config <SOURCE_DIR>/arch/Makefile.lammps
     CONFIGURE_COMMAND env QUIP_ARCH=lammps make config
     BUILD_COMMAND env QUIP_ARCH=lammps make libquip
     INSTALL_COMMAND ""
diff --git a/cmake/Modules/Packages/MSCG.cmake b/cmake/Modules/Packages/MSCG.cmake
index 6ac62cb012..cf3d506c82 100644
--- a/cmake/Modules/Packages/MSCG.cmake
+++ b/cmake/Modules/Packages/MSCG.cmake
@@ -12,6 +12,13 @@ if(DOWNLOAD_MSCG)
   mark_as_advanced(MSCG_URL)
   mark_as_advanced(MSCG_MD5)
 
+  # CMake cannot pass the BLAS or LAPACK library variables to an external project if they are lists (more than one entry)
+  list(LENGTH BLAS_LIBRARIES NUM_BLAS)
+  list(LENGTH LAPACK_LIBRARIES NUM_LAPACK)
+  if((NUM_BLAS GREATER 1) OR (NUM_LAPACK GREATER 1))
+    message(FATAL_ERROR "Cannot compile downloaded MSCG library due to a technical limitation")
+  endif()
+
   include(ExternalProject)
   ExternalProject_Add(mscg_build
     URL     ${MSCG_URL}
diff --git a/cmake/Modules/Packages/SCAFACOS.cmake b/cmake/Modules/Packages/SCAFACOS.cmake
index fd355420c3..de611a1edb 100644
--- a/cmake/Modules/Packages/SCAFACOS.cmake
+++ b/cmake/Modules/Packages/SCAFACOS.cmake
@@ -23,6 +23,11 @@ if(DOWNLOAD_SCAFACOS)
   file(DOWNLOAD ${LAMMPS_THIRDPARTY_URL}/scafacos-1.0.1-fix.diff ${CMAKE_CURRENT_BINARY_DIR}/scafacos-1.0.1.fix.diff
           EXPECTED_HASH MD5=4baa1333bb28fcce102d505e1992d032)
 
+  find_program(HAVE_PATCH patch)
+  if(NOT HAVE_PATCH)
+    message(FATAL_ERROR "The 'patch' program is required to build the ScaFaCoS library")
+  endif()
+
   include(ExternalProject)
   ExternalProject_Add(scafacos_build
     URL     ${SCAFACOS_URL}
diff --git a/cmake/Modules/Packages/VORONOI.cmake b/cmake/Modules/Packages/VORONOI.cmake
index 7feea4c52e..c010469677 100644
--- a/cmake/Modules/Packages/VORONOI.cmake
+++ b/cmake/Modules/Packages/VORONOI.cmake
@@ -26,6 +26,11 @@ if(DOWNLOAD_VORO)
     set(VORO_BUILD_OPTIONS CXX=${CMAKE_CXX_COMPILER} CFLAGS=${VORO_BUILD_CFLAGS})
   endif()
 
+  find_program(HAVE_PATCH patch)
+  if(NOT HAVE_PATCH)
+    message(FATAL_ERROR "The 'patch' program is required to build the voro++ library")
+  endif()
+
   ExternalProject_Add(voro_build
     URL     ${VORO_URL}
     URL_MD5 ${VORO_MD5}
diff --git a/cmake/presets/hip_amd.cmake b/cmake/presets/hip_amd.cmake
new file mode 100644
index 0000000000..4b8945e0c7
--- /dev/null
+++ b/cmake/presets/hip_amd.cmake
@@ -0,0 +1,30 @@
+# preset that will enable hip (clang/clang++) with support for MPI and OpenMP (on Linux boxes)
+
+# prefer flang over gfortran, if available
+find_program(CLANG_FORTRAN NAMES flang gfortran f95)
+set(ENV{OMPI_FC} ${CLANG_FORTRAN})
+
+set(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "" FORCE)
+set(CMAKE_C_COMPILER "hipcc" CACHE STRING "" FORCE)
+set(CMAKE_Fortran_COMPILER ${CLANG_FORTRAN} CACHE STRING "" FORCE)
+set(CMAKE_CXX_FLAGS_DEBUG "-Wall -Wextra -g" CACHE STRING "" FORCE)
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG" CACHE STRING "" FORCE)
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "" FORCE)
+set(CMAKE_Fortran_FLAGS_DEBUG "-Wall -Wextra -g -std=f2003" CACHE STRING "" FORCE)
+set(CMAKE_Fortran_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG -std=f2003" CACHE STRING "" FORCE)
+set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -DNDEBUG -std=f2003" CACHE STRING "" FORCE)
+set(CMAKE_C_FLAGS_DEBUG "-Wall -Wextra -g" CACHE STRING "" FORCE)
+set(CMAKE_C_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG" CACHE STRING "" FORCE)
+set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "" FORCE)
+
+set(MPI_CXX "hipcc" CACHE STRING "" FORCE)
+set(MPI_CXX_COMPILER "mpicxx" CACHE STRING "" FORCE)
+
+unset(HAVE_OMP_H_INCLUDE CACHE)
+set(OpenMP_C "hipcc" CACHE STRING "" FORCE)
+set(OpenMP_C_FLAGS "-fopenmp" CACHE STRING "" FORCE)
+set(OpenMP_C_LIB_NAMES "omp" CACHE STRING "" FORCE)
+set(OpenMP_CXX "hipcc" CACHE STRING "" FORCE)
+set(OpenMP_CXX_FLAGS "-fopenmp" CACHE STRING "" FORCE)
+set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE)
+set(OpenMP_omp_LIBRARY "libomp.so" CACHE PATH "" FORCE)
diff --git a/doc/lammps.1 b/doc/lammps.1
index fb79b8d774..c868a2a86f 100644
--- a/doc/lammps.1
+++ b/doc/lammps.1
@@ -1,4 +1,4 @@
-.TH LAMMPS "20 September 2021" "2021-09-20"
+.TH LAMMPS "29 September 2021" "2021-09-29"
 .SH NAME
 .B LAMMPS
 \- Molecular Dynamics Simulator.
diff --git a/doc/src/Build_settings.rst b/doc/src/Build_settings.rst
index 074a6349e6..c7397935d8 100644
--- a/doc/src/Build_settings.rst
+++ b/doc/src/Build_settings.rst
@@ -71,7 +71,8 @@ LAMMPS can use them if they are available on your system.
 
          -D FFTW3_INCLUDE_DIR=path   # path to FFTW3 include files
          -D FFTW3_LIBRARY=path       # path to FFTW3 libraries
-         -D FFT_FFTW_THREADS=on      # enable using threaded FFTW3 libraries
+         -D FFTW3_OMP_LIBRARY=path   # path to FFTW3 OpenMP wrapper libraries
+         -D FFT_FFTW_THREADS=on      # enable using OpenMP threaded FFTW3 libraries
          -D MKL_INCLUDE_DIR=path     # ditto for Intel MKL library
          -D FFT_MKL_THREADS=on       # enable using threaded FFTs with MKL libraries
          -D MKL_LIBRARY=path         # path to MKL libraries
diff --git a/doc/src/Developer.rst b/doc/src/Developer.rst
index f54bc4152f..f68007486d 100644
--- a/doc/src/Developer.rst
+++ b/doc/src/Developer.rst
@@ -11,6 +11,7 @@ of time and requests from the LAMMPS user community.
    :maxdepth: 1
 
    Developer_org
+   Developer_parallel
    Developer_flow
    Developer_write
    Developer_notes
diff --git a/doc/src/Developer_par_comm.rst b/doc/src/Developer_par_comm.rst
new file mode 100644
index 0000000000..2e108dda13
--- /dev/null
+++ b/doc/src/Developer_par_comm.rst
@@ -0,0 +1,120 @@
+Communication
+^^^^^^^^^^^^^
+
+Following the partitioning scheme in use all per-atom data is
+distributed across the MPI processes, which allows LAMMPS to handle very
+large systems provided it uses a correspondingly large number of MPI
+processes.  The per-atom data comprises atom IDs, positions, velocities,
+types, etc.  To be able to compute the short-range interactions, MPI
+processes need access not only to data of atoms they "own" but also to
+information about atoms from neighboring sub-domains, in LAMMPS referred
+to as "ghost" atoms.  These are copies of atoms storing required
+per-atom data for up to the communication cutoff distance. The green
+dashed-line boxes in the :ref:`domain-decomposition` figure illustrate
+the extended ghost-atom sub-domain for one processor.
+
+This approach is also used to implement periodic boundary
+conditions: atoms that lie within the cutoff distance across a periodic
+boundary are also stored as ghost atoms and taken from the periodic
+replication of the sub-domain, which may be the same sub-domain, e.g. if
+running in serial.  As a consequence of this, force computation in
+LAMMPS is not subject to minimum image conventions and thus cutoffs may
+be larger than half the simulation domain.
+
+.. _ghost-atom-comm:
+.. figure:: img/ghost-comm.png
+   :align: center
+
+   ghost atom communication
+
+   This figure shows the ghost atom communication patterns between
+   sub-domains for "brick" (left) and "tiled" communication styles for
+   2d simulations.  The numbers indicate MPI process ranks.  Here the
+   sub-domains are drawn spatially separated for clarity.  The
+   dashed-line box is the extended sub-domain of processor 0 which
+   includes its ghost atoms.  The red- and blue-shaded boxes are the
+   regions of communicated ghost atoms.
+
+Efficient communication patterns are needed to update the "ghost" atom
+data, since that needs to be done at every MD time step or minimization
+step.  The diagrams of the :ref:`ghost-atom-comm` figure illustrate how ghost
+atom communication is performed in two stages for a 2d simulation (three
+in 3d) for both a regular and irregular partitioning of the simulation
+box.  For the regular case (left) atoms are exchanged first in the
+*x*-direction, then in *y*, with four neighbors in the grid of processor
+sub-domains.
+
+In the *x* stage, processor ranks 1 and 2 send owned atoms in their
+red-shaded regions to rank 0 (and vice versa).  Then in the *y* stage,
+ranks 3 and 4 send atoms in their blue-shaded regions to rank 0, which
+includes ghost atoms they received in the *x* stage.  Rank 0 thus
+acquires all its ghost atoms; atoms in the solid blue corner regions
+are communicated twice before rank 0 receives them.
+
+For the irregular case (right) the two stages are similar, but a
+processor can have more than one neighbor in each direction.  In the
+*x* stage, MPI ranks 1,2,3 send owned atoms in their red-shaded regions to
+rank 0 (and vice versa).  These include only atoms between the lower
+and upper *y*-boundary of rank 0's sub-domain.  In the *y* stage, ranks
+4,5,6 send atoms in their blue-shaded regions to rank 0.  This may
+include ghost atoms they received in the *x* stage, but only if they
+are needed by rank 0 to fill its extended ghost atom regions in the
++/-*y* directions (blue rectangles).  Thus in this case, ranks 5 and
+6 do not include ghost atoms they received from each other (in the *x*
+stage) in the atoms they send to rank 0.  The key point is that while
+the pattern of communication is more complex in the irregular
+partitioning case, it can still proceed in two stages (three in 3d)
+via atom exchanges with only neighboring processors.
+
+When attributes of owned atoms are sent to neighboring processors to
+become attributes of their ghost atoms, LAMMPS calls this a "forward"
+communication.  On timesteps when atoms migrate to new owning processors
+and neighbor lists are rebuilt, each processor creates a list of its
+owned atoms which are ghost atoms in each of its neighbor processors.
+These lists are used to pack per-atom coordinates (for example) into
+message buffers in subsequent steps until the next reneighboring.
+
+A "reverse" communication is when computed ghost atom attributes are
+sent back to the processor that owns the atom.  This is used (for
+example) to sum partial forces on ghost atoms to the complete force on
+owned atoms.  The order of the two stages described in the
+:ref:`ghost-atom-comm` figure is inverted and the same lists of atoms
+are used to pack and unpack message buffers with per-atom forces.  When
+a received buffer is unpacked, the ghost forces are summed to owned atom
+forces.  As in forward communication, forces on atoms in the four blue
+corners of the diagrams are sent, received, and summed twice (once at
+each stage) before owning processors have the full force.
+
+These two operations are used in many places within LAMMPS aside from
+exchange of coordinates and forces, for example by manybody potentials
+to share intermediate per-atom values, or by rigid-body integrators to
+enable each atom in a body to access body properties.  Here are
+additional details about how these communication operations are
+performed in LAMMPS:
+
+- When exchanging data with different processors, forward and reverse
+  communication is done using ``MPI_Send()`` and ``MPI_Irecv()`` calls.
+  If a processor is "exchanging" atoms with itself, only the pack and
+  unpack operations are performed, e.g. to create ghost atoms across
+  periodic boundaries when running on a single processor.
+
+- For forward communication of owned atom coordinates, periodic box
+  lengths are added and subtracted when the receiving processor is
+  across a periodic boundary from the sender.  There is then no need to
+  apply a minimum image convention when calculating distances between
+  atom pairs when building neighbor lists or computing forces.
+
+- The cutoff distance for exchanging ghost atoms is typically equal to
+  the neighbor cutoff.  But it can also be chosen to be longer if needed,
+  e.g. half the diameter of a rigid body composed of multiple atoms or
+  over 3x the length of a stretched bond for dihedral interactions.  It
+  can also exceed the periodic box size.  For the regular communication
+  pattern (left), if the cutoff distance extends beyond a neighbor
+  processor's sub-domain, then multiple exchanges are performed in the
+  same direction.  Each exchange is with the same neighbor processor,
+  but buffers are packed/unpacked using a different list of atoms. For
+  forward communication, in the first exchange a processor sends only
+  owned atoms.  In subsequent exchanges, it sends ghost atoms received
+  in previous exchanges.  For the irregular pattern (right) overlaps of
+  a processor's extended ghost-atom sub-domain with all other processors
+  in each dimension are detected.
diff --git a/doc/src/Developer_par_long.rst b/doc/src/Developer_par_long.rst
new file mode 100644
index 0000000000..f297cf3fa6
--- /dev/null
+++ b/doc/src/Developer_par_long.rst
@@ -0,0 +1,188 @@
+Long-range interactions
+^^^^^^^^^^^^^^^^^^^^^^^
+
+For charged systems, LAMMPS can compute long-range Coulombic
+interactions via the FFT-based particle-particle/particle-mesh (PPPM)
+method implemented in :doc:`kspace style pppm and its variants
+<kspace_style>`.  For that Coulombic interactions are partitioned into
+short- and long-range components.  The short-ranged portion is computed
+in real space as a loop over pairs of charges within a cutoff distance,
+using neighbor lists.  The long-range portion is computed in reciprocal
+space using a kspace style.  For the PPPM implementation the simulation
+cell is overlaid with a regular FFT grid in 3d. It proceeds in several stages:
+
+a) each atom's point charge is interpolated to nearby FFT grid points,
+b) a forward 3d FFT is performed,
+c) a convolution operation is performed in reciprocal space,
+d) one or more inverse 3d FFTs are performed, and
+e) electric field values from grid points near each atom are interpolated to compute
+   its forces.
+
+For any of the spatial-decomposition partitioning schemes each processor
+owns the brick-shaped portion of FFT grid points contained within its
+sub-domain.  The two interpolation operations use a stencil of grid
+points surrounding each atom.  To accommodate the stencil size, each
+processor also stores a few layers of ghost grid points surrounding its
+brick.  Forward and reverse communication of grid point values is
+performed similar to the corresponding :doc:`atom data communication
+<Developer_par_comm>`.  In this case, electric field values on owned
+grid points are sent to neighboring processors to become ghost point
+values.  Likewise charge values on ghost points are sent and summed to
+values on owned points.
+
+For triclinic simulation boxes, the FFT grid planes are parallel to
+the box faces, but the mapping of charge and electric field values
+to/from grid points is done in reduced coordinates where the tilted
+box is conceptually a unit cube, so that the stencil and FFT
+operations are unchanged.  However the FFT grid size required for a
+given accuracy is larger for triclinic domains than it is for
+orthogonal boxes.
+
+.. _fft-parallel:
+.. figure:: img/fft-decomp-parallel.png
+   :align: center
+
+   parallel FFT in PPPM
+
+   Stages of a parallel FFT for a simulation domain overlaid
+   with an 8x8x8 3d FFT grid, partitioned across 64 processors.
+   Within each of the 4 diagrams, grid cells of the same color are
+   owned by a single processor; for simplicity only cells owned by 4
+   or 8 of the 64 processors are colored.  The two images on the left
+   illustrate brick-to-pencil communication.  The two images on the
+   right illustrate pencil-to-pencil communication, which in this
+   case transposes the *y* and *z* dimensions of the grid.
+
+Parallel 3d FFTs require substantial communication relative to their
+computational cost.  A 3d FFT is implemented by a series of 1d FFTs
+along the *x-*, *y-*, and *z-*\ direction of the FFT grid.  Thus the FFT
+grid cannot be decomposed like atoms into 3 dimensions for parallel
+processing of the FFTs but only in 1 (as planes) or 2 (as pencils)
+dimensions and in between the steps the grid needs to be transposed to
+have the FFT grid portion "owned" by each MPI process complete in the
+direction of the 1d FFTs it has to perform. LAMMPS uses the
+pencil-decomposition algorithm as shown in the :ref:`fft-parallel` figure.
+
+Initially (far left), each processor owns a brick of same-color grid
+cells (actually grid points) contained within its sub-domain.  A
+brick-to-pencil communication operation converts this layout to 1d
+pencils in the *x*-dimension (center left).  Again, cells of the same
+color are owned by the same processor.  Each processor can then compute
+a 1d FFT on each pencil of data it wholly owns using a call to the
+configured FFT library.  A pencil-to-pencil communication then converts
+this layout to pencils in the *y* dimension (center right) which
+effectively transposes the *x* and *y* dimensions of the grid, followed
+by 1d FFTs in *y*.  A final transpose of pencils from *y* to *z* (far
+right) followed by 1d FFTs in *z* completes the forward FFT.  The data
+is left in a *z*-pencil layout for the convolution operation.  One or
+more inverse FFTs then perform the sequence of 1d FFTs and communication
+steps in reverse order; the final layout of resulting grid values is the
+same as the initial brick layout.
+
+Each communication operation within the FFT (brick-to-pencil or
+pencil-to-pencil or pencil-to-brick) converts one tiling of the 3d grid
+to another, where a tiling in this context means an assignment of a
+small brick-shaped subset of grid points to each processor, the union of
+which comprises the entire grid.  The parallel `fftMPI library
+<https://lammps.github.io/fftmpi/>`_ written for LAMMPS allows arbitrary
+definitions of the tiling so that an irregular partitioning of the
+simulation domain can use it directly.  Transforming data from one
+tiling to another is implemented in `fftMPI` using point-to-point
+communication, where each processor sends data to a few other
+processors, since each tile in the initial tiling overlaps with a
+handful of tiles in the final tiling.
+
+The transformations could also be done using collective communication
+across all *P* processors with a single call to ``MPI_Alltoall()``, but
+this is typically much slower.  However, for the specialized brick and
+pencil tiling illustrated in the :ref:`fft-parallel` figure, collective
+communication across the entire MPI communicator is not required.  In
+the example an :math:`8^3` grid with 512 grid cells is partitioned
+across 64 processors; each processor owns a 2x2x2 3d brick of grid
+cells.  The initial brick-to-pencil communication (upper left to upper
+right) only requires collective communication within subgroups of 4
+processors, as illustrated by the 4 colors.  More generally, a
+brick-to-pencil communication can be performed by partitioning *P*
+processors into :math:`P^{\frac{2}{3}}` subgroups of
+:math:`P^{\frac{1}{3}}` processors each.  Each subgroup performs
+collective communication only within its subgroup.  Similarly,
+pencil-to-pencil communication can be performed by partitioning *P*
+processors into :math:`P^{\frac{1}{2}}` subgroups of
+:math:`P^{\frac{1}{2}}` processors each.  This is illustrated in the
+figure for the :math:`y \Rightarrow z` communication (center).  An
+eight-processor subgroup owns the front *yz* plane of data and performs
+collective communication within the subgroup to transpose from a
+*y*-pencil to *z*-pencil layout.
+
+LAMMPS invokes point-to-point communication by default, but also
+provides the option of partitioned collective communication when using a
+:doc:`kspace_modify collective yes <kspace_modify>` command to switch to
+that mode.  In the latter case, the code detects the size of the
+disjoint subgroups and partitions the single *P*-size communicator into
+multiple smaller communicators, each of which invokes collective
+communication.  Testing on a large IBM Blue Gene/Q machine at Argonne
+National Labs showed a significant improvement in FFT performance for
+large processor counts; partitioned collective communication was faster
+than point-to-point communication or global collective communication
+involving all *P* processors.
+
+Here are some additional details about FFTs for long-range and related
+grid/particle operations that LAMMPS supports:
+
+- The fftMPI library allows each grid dimension to be a multiple of
+  small prime factors (2,3,5), and allows any number of processors to
+  perform the FFT.  The resulting brick and pencil decompositions are
+  thus not always as well-aligned but the size of subgroups of
+  processors for the two modes of communication (brick/pencil and
+  pencil/pencil) still scale as :math:`O(P^{\frac{1}{3}})` and
+  :math:`O(P^{\frac{1}{2}})`.
+
+- For efficiency in performing 1d FFTs, the grid transpose
+  operations illustrated in the :ref:`fft-parallel` figure also involve
+  reordering the 3d data so that a different dimension is contiguous
+  in memory.  This reordering can be done during the packing or
+  unpacking of buffers for MPI communication.
+
+- For large systems and particularly a large number of MPI processes,
+  the dominant cost for parallel FFTs is often the communication, not
+  the computation of 1d FFTs, even though the latter scales as :math:`N
+  \log(N)` in the number of grid points *N* per grid direction.  This is
+  due to the fact that only a 2d decomposition into pencils is possible
+  while atom data (and their corresponding short-range force and energy
+  computations) can be decomposed efficiently in 3d.
+
+  This can be addressed by reducing the number of MPI processes involved
+  in the MPI communication by using :doc:`hybrid MPI + OpenMP
+  parallelization <Speed_omp>`.  This will use OpenMP parallelization
+  inside the MPI domains and while that may have a lower parallel
+  efficiency, it reduces the communication overhead.
+
+  As an alternative it is also possible to start a :ref:`multi-partition
+  <partition>` calculation and then use the :doc:`verlet/split
+  integrator <run_style>` to perform the PPPM computation on a
+  dedicated, separate partition of MPI processes.  This uses an integer
+  "1:*p*" mapping of *p* sub-domains of the atom decomposition to one
+  sub-domain of the FFT grid decomposition and where pairwise non-bonded
+  and bonded forces and energies are computed on the larger partition
+  and the PPPM kspace computation concurrently on the smaller partition.
+
+- LAMMPS also implements PPPM-based solvers for other long-range
+  interactions, dipole and dispersion (Lennard-Jones), which can be used
+  in conjunction with long-range Coulombics for point charges.
+
+- LAMMPS implements a ``GridComm`` class which overlays the simulation
+  domain with a regular grid, partitions it across processors in a
+  manner consistent with processor sub-domains, and provides methods for
+  forward and reverse communication of owned and ghost grid point
+  values.  It is used for PPPM as an FFT grid (as outlined above) and
+  also for the MSM algorithm which uses a cascade of grid sizes from
+  fine to coarse to compute long-range Coulombic forces.  The GridComm
+  class is also useful for models where continuum fields interact with
+  particles.  For example, the two-temperature model (TTM) defines heat
+  transfer between atoms (particles) and electrons (continuum gas) where
+  spatial variations in the electron temperature are computed by finite
+  differences of a discretized heat equation on a regular grid.  The
+  :doc:`fix ttm/grid <fix_ttm>` command uses the ``GridComm`` class
+  internally to perform its grid operations on a distributed grid
+  instead of the original :doc:`fix ttm <fix_ttm>` which uses a
+  replicated grid.
diff --git a/doc/src/Developer_par_neigh.rst b/doc/src/Developer_par_neigh.rst
new file mode 100644
index 0000000000..4b286d77d8
--- /dev/null
+++ b/doc/src/Developer_par_neigh.rst
@@ -0,0 +1,159 @@
+Neighbor lists
+^^^^^^^^^^^^^^
+
+To compute forces efficiently, each processor creates a Verlet-style
+neighbor list which enumerates all pairs of atoms *i,j* (*i* = owned,
+*j* = owned or ghost) with separation less than the applicable
+neighbor list cutoff distance.  In LAMMPS the neighbor lists are stored
+in a multiple-page data structure; each page is a contiguous chunk of
+memory which stores vectors of neighbor atoms *j* for many *i* atoms.
+This allows pages to be incrementally allocated or deallocated in blocks
+as needed.  Neighbor lists typically consume the most memory of any data
+structure in LAMMPS.  The neighbor list is rebuilt (from scratch) once
+every few timesteps, then used repeatedly each step for force or other
+computations.  The neighbor cutoff distance is :math:`R_n = R_f +
+\Delta_s`, where :math:`R_f` is the (largest) force cutoff defined by
+the interatomic potential for computing short-range pairwise or manybody
+forces and :math:`\Delta_s` is a "skin" distance that allows the list to
+be used for multiple steps assuming that atoms do not move very far
+between consecutive time steps.  Typically the code triggers
+reneighboring when any atom has moved half the skin distance since the
+last reneighboring; this and other options of the neighbor list rebuild
+can be adjusted with the :doc:`neigh_modify <neigh_modify>` command.
+
+On steps when reneighboring is performed, atoms which have moved outside
+their owning processor's sub-domain are first migrated to new processors
+via communication.  Periodic boundary conditions are also (only)
+enforced on these steps to ensure each atom is re-assigned to the
+correct processor.  After migration, the atoms owned by each processor
+are stored in a contiguous vector.  Periodically each processor
+spatially sorts owned atoms within its vector to reorder it for improved
+cache efficiency in force computations and neighbor list building.  For
+that, atoms are spatially binned and then reordered so that atoms in the
+same bin are adjacent in the vector.  Atom sorting can be disabled or
+its settings modified with the :doc:`atom_modify <atom_modify>` command.
+
+.. _neighbor-stencil:
+.. figure:: img/neigh-stencil.png
+   :align: center
+
+   neighbor list stencils
+
+   A 2d simulation sub-domain (thick black line) and the corresponding
+   ghost atom cutoff region (dashed blue line) for both orthogonal
+   (left) and triclinic (right) domains.  A regular grid of neighbor
+   bins (thin lines) overlays the entire simulation domain and need not
+   align with sub-domain boundaries; only the portion overlapping the
+   augmented sub-domain is shown.  In the triclinic case it overlaps the
+   bounding box of the tilted rectangle.  The blue- and red-shaded bins
+   represent a stencil of bins searched to find neighbors of a particular
+   atom (black dot).
+
+To build a local neighbor list in linear time, the simulation domain is
+overlaid (conceptually) with a regular 3d (or 2d) grid of neighbor bins,
+as shown in the :ref:`neighbor-stencil` figure for 2d models and a
+single MPI processor's sub-domain.  Each processor stores a set of
+neighbor bins which overlap its sub-domain extended by the neighbor
+cutoff distance :math:`R_n`.  As illustrated, the bins need not align
+with processor boundaries; an integer number in each dimension is fit to
+the size of the entire simulation box.
+
+Most often LAMMPS builds what it calls a "half" neighbor list where
+each *i,j* neighbor pair is stored only once, with either atom *i* or
+*j* as the central atom.  The build can be done efficiently by using a
+pre-computed "stencil" of bins around a central origin bin which
+contains the atom whose neighbors are being searched for.  A stencil
+is simply a list of integer offsets in *x,y,z* of nearby bins
+surrounding the origin bin which are close enough to contain any
+neighbor atom *j* within a distance :math:`R_n` from any atom *i* in the
+origin bin.  Note that for a half neighbor list, the stencil can be
+asymmetric since each atom only need store half its nearby neighbors.
+
+These stencils are illustrated in the figure for a half list and a bin
+size of :math:`\frac{1}{2} R_n`.  There are 13 red+blue stencil bins in
+2d (for the orthogonal case, 15 for triclinic).  In 3d there would be
+63, 13 in the plane of bins that contain the origin bin and 25 in each
+of the two planes above it in the *z* direction (75 for triclinic).  The
+reason the triclinic stencil has extra bins is because the bins tile the
+bounding box of the entire triclinic domain and thus are not periodic
+with respect to the simulation box itself.  The stencil and logic for
+determining which *i,j* pairs to include in the neighbor list are
+altered slightly to account for this.
+
+To build a neighbor list, a processor first loops over its "owned" plus
+"ghost" atoms and assigns each to a neighbor bin.  This uses an integer
+vector to create a linked list of atom indices within each bin.  It then
+performs a triply-nested loop over its owned atoms *i*, the stencil of
+bins surrounding atom *i*'s bin, and the *j* atoms in each stencil bin
+(including ghost atoms).  If the distance :math:`r_{ij} < R_n`, then
+atom *j* is added to the vector of atom *i*'s neighbors.
+
+Here are additional details about neighbor list build options LAMMPS
+supports:
+
+- The choice of bin size is an option; a size half of :math:`R_n` has
+  been found to be optimal for many typical cases.  Smaller bins incur
+  additional overhead to loop over; larger bins require more distance
+  calculations.  Note that for smaller bin sizes, the 2d stencil in the
+  figure would be more semi-circular in shape (hemispherical in 3d),
+  with bins near the corners of the square eliminated due to their
+  distance from the origin bin.
+
+- Depending on the interatomic potential(s) and other commands used in
+  an input script, multiple neighbor lists and stencils with different
+  attributes may be needed.  This includes lists with different cutoff
+  distances, e.g. for force computation versus occasional diagnostic
+  computations such as a radial distribution function, or for the
+  r-RESPA time integrator which can partition pairwise forces by
+  distance into subsets computed at different time intervals.  It
+  includes "full" lists (as opposed to half lists) where each *i,j* pair
+  appears twice, stored once with *i* and once with *j*, and which use a larger
+  symmetric stencil.  It also includes lists with partial enumeration of
+  ghost atom neighbors.  The full and ghost-atom lists are used by
+  various manybody interatomic potentials.  Lists may also use different
+  criteria for inclusion of a pair interaction.  Typically this
+  depends only on the distance between two atoms and the cutoff
+  distance.  But for finite-size coarse-grained particles with
+  individual diameters (e.g. polydisperse granular particles), it can
+  also depend on the diameters of the two particles.
+
+- When using :doc:`pair style hybrid <pair_hybrid>` multiple sub-lists
+  of the master neighbor list for the full system need to be generated,
+  one for each sub-style, which contains only the *i,j* pairs needed to
+  compute interactions between subsets of atoms for the corresponding
+  potential.  This means not all *i* or *j* atoms owned by a processor
+  are included in a particular sub-list.
+
+- Some models use different cutoff lengths for pairwise interactions
+  between different kinds of particles which are stored in a single
+  neighbor list.  One example is a solvated colloidal system with large
+  colloidal particles where colloid/colloid, colloid/solvent, and
+  solvent/solvent interaction cutoffs can be dramatically different.
+  Another is a model of polydisperse finite-size granular particles;
+  pairs of particles interact only when they are in contact with each
+  other.  Mixtures with particle size ratios as high as 10-100x may be
+  used to model realistic systems.  Efficient neighbor list building
+  algorithms for these kinds of systems are available in LAMMPS.  They
+  include a method which uses different stencils for different cutoff
+  lengths and trims the stencil to only include bins that straddle the
+  cutoff sphere surface.  More recently a method which uses both
+  multiple stencils and multiple bin sizes was developed; it builds
+  neighbor lists efficiently for systems with particles of any size
+  ratio, though other considerations (timestep size, force computations)
+  may limit the ability to model systems with huge polydispersity.
+
+- For small and sparse systems and as a fallback method, LAMMPS also
+  supports neighbor list construction without binning by using a full
+  :math:`O(N^2)` loop over all *i,j* atom pairs in a sub-domain when
+  using the :doc:`neighbor nsq <neighbor>` command.
+
+- Depending on the "pair" setting of the :doc:`newton <newton>` command,
+  the "half" neighbor lists may contain **all** pairs of atoms where
+  atom *j* is a ghost atom (i.e. when the newton pair setting is *off*).
+  For the newton pair *on* setting the atom *j* is only added to the
+  list if its *z* coordinate is larger, or if equal the *y* coordinate
+  is larger, and if that is equal, too, the *x* coordinate is larger.  For
+  homogeneously dense systems that will result in picking neighbors from
+  a same size sector in always the same direction relative to the
+  "owned" atom and thus it should lead to similar length neighbor lists
+  and thus reduce the chance of a load imbalance.
diff --git a/doc/src/Developer_par_openmp.rst b/doc/src/Developer_par_openmp.rst
new file mode 100644
index 0000000000..91c649a7b8
--- /dev/null
+++ b/doc/src/Developer_par_openmp.rst
@@ -0,0 +1,114 @@
+OpenMP Parallelism
+^^^^^^^^^^^^^^^^^^
+
+The styles in the INTEL, KOKKOS, and OPENMP packages offer to use OpenMP
+thread parallelism to predominantly distribute loops over local data
+and thus follow an orthogonal parallelization strategy to the
+decomposition into spatial domains used by the :doc:`MPI partitioning
+<Developer_par_part>`.  For clarity, this section discusses only the
+implementation in the OPENMP package as it is the simplest. The INTEL
+and KOKKOS packages offer additional options and are more complex since
+they support more features and different hardware like co-processors
+or GPUs.
+
+One of the key decisions when implementing the OPENMP package was to
+keep the changes to the source code small, so that it would be easier to
+maintain the code and keep it in sync with the non-threaded standard
+implementation.  This is achieved by a) making the OPENMP version a
+derived class from the regular version (e.g. ``PairLJCutOMP`` from
+``PairLJCut``) and overriding only methods that are multi-threaded or
+need to be modified to support multi-threading (similar to what was done
+in the OPT package), b) keeping the structure in the modified code very
+similar so that side-by-side comparisons are still useful, and c)
+offloading additional functionality and multi-thread support functions
+into three separate classes ``ThrOMP``, ``ThrData``, and ``FixOMP``.
+``ThrOMP`` provides additional, multi-thread aware functionality not
+available in the corresponding base class (e.g. ``Pair`` for
+``PairLJCutOMP``) like multi-thread aware variants of the "tally"
+functions. Those functions are made available through multiple
+inheritance so those new functions have to have unique names to avoid
+ambiguities; typically ``_thr`` is appended to the name of the function.
+``ThrData`` is a class that manages per-thread data structures.
+It is used instead of extending the corresponding storage to per-thread
+arrays to avoid slowdowns due to "false sharing" when multiple threads
+update adjacent elements in an array and thus force the CPU cache lines
+to be reset and re-fetched.  ``FixOMP`` finally manages the "multi-thread
+state" like settings and access to per-thread storage, it is activated
+by the :doc:`package omp <package>` command.
+
+Avoiding data races
+"""""""""""""""""""
+
+A key problem when implementing thread parallelism in an MD code is
+to avoid data races when updating accumulated properties like forces,
+energies, and stresses.  When interactions are computed, they always
+involve multiple atoms and thus there are race conditions when multiple
+threads want to update per-atom data of the same atoms.  Five possible
+strategies have been considered to avoid this:
+
+1) restructure the code so that there is no overlapping access possible
+   when computing in parallel, e.g. by breaking lists into multiple
+   parts and synchronizing threads in between.
+2) have each thread be "responsible" for a specific group of atoms and
+   compute these interactions multiple times, once on each thread that
+   is responsible for a given atom and then have each thread only update
+   the properties of this atom.
+3) use mutexes around functions and regions of code where the data race
+   could happen
+4) use atomic operations when updating per-atom properties
+5) use replicated per-thread data structures to accumulate data without
+   conflicts and then use a reduction to combine those results into the
+   data structures used by the regular style.
+
+Option 5 was chosen for the OPENMP package because it would retain the
+performance for the case of 1 thread and the code would be more
+maintainable.  Option 1 would require extensive code changes,
+particularly to the neighbor list code; option 2 would have incurred a
+2x or more performance penalty for the serial case; option 3 causes
+significant overhead and would enforce serialization of operations in
+inner loops and thus defeat the purpose of multi-threading; option 4
+slows down the serial case although not quite as bad as option 2.  The
+downside of option 5 is that the overhead of the reduction operations
+grows with the number of threads used, so there would be a crossover
+point where options 2 or 4 would result in faster execution.  That is
+why option 2 for example is used in the GPU package because a GPU is a
+processor with a massive number of threads.  However, since the MPI
+parallelization is generally more effective for typical MD systems, the
+expectation is that thread parallelism is only used for a smaller number
+of threads (2-8).  At the time of its implementation, that number was
+equivalent to the number of CPU cores per CPU socket on high-end
+supercomputers.
+
+Thus arrays like the force array are dimensioned to the number of atoms
+times the number of threads when enabling OpenMP support and inside the
+compute functions a pointer to a different chunk is obtained by each thread.
+Similarly, accumulators like potential energy or virial are kept in
+per-thread instances of the ``ThrData`` class and then only reduced and
+stored in their global counterparts at the end of the force computation.
+
+
+Loop scheduling
+"""""""""""""""
+
+Multi-thread parallelization is applied by distributing (outer) loops
+statically across threads.  Typically this would be the loop over local
+atoms *i* when processing *i,j* pairs of atoms from a neighbor list.
+The design of the neighbor list code results in atoms having a similar
+number of neighbors for homogeneous systems and thus load imbalances
+across threads are not common and typically happen for systems where
+also the MPI parallelization would be unbalanced, which would typically
+have a more pronounced impact on the performance.  This same loop
+scheduling scheme can also be applied to the reduction operations on
+per-atom data to try and reduce the overhead of the reduction operation.
+
+Neighbor list parallelization
+"""""""""""""""""""""""""""""
+
+In addition to the parallelization of force computations, also the
+generation of the neighbor lists is parallelized.  As explained
+previously, neighbor lists are built by looping over "owned" atoms and
+storing the neighbors in "pages".  In the OPENMP variants of the
+neighbor list code, each thread operates on a different chunk of "owned"
+atoms and allocates and fills its own set of pages with neighbor list
+data.  This is achieved by each thread keeping its own instance of the
+:cpp:class:`MyPage <LAMMPS_NS::MyPage>` page allocator class.
diff --git a/doc/src/Developer_par_part.rst b/doc/src/Developer_par_part.rst
new file mode 100644
index 0000000000..f797f559e2
--- /dev/null
+++ b/doc/src/Developer_par_part.rst
@@ -0,0 +1,89 @@
+Partitioning
+^^^^^^^^^^^^
+
+The underlying spatial decomposition strategy used by LAMMPS for
+distributed-memory parallelism is set with the :doc:`comm_style command
+<comm_style>` and can be either "brick" (a regular grid) or "tiled".
+
+.. _domain-decomposition:
+.. figure:: img/domain-decomp.png
+   :align: center
+
+   domain decomposition
+
+   This figure shows the different kinds of domain decomposition used
+   for MPI parallelization: "brick" on the left with an orthogonal
+   (left) and a triclinic (middle) simulation domain, and a "tiled"
+   decomposition (right).  The black lines show the division into
+   sub-domains and the contained atoms are "owned" by the corresponding
+   MPI process. The green dashed lines indicate how sub-domains are
+   extended with "ghost" atoms up to the communication cutoff distance.
+
+The LAMMPS simulation box is a 3d or 2d volume, which can be orthogonal
+or triclinic in shape, as illustrated in the :ref:`domain-decomposition`
+figure for the 2d case.  Orthogonal means the box edges are aligned with
+the *x*, *y*, *z* Cartesian axes, and the box faces are thus all
+rectangular.  Triclinic allows for a more general parallelepiped shape
+in which edges are aligned with three arbitrary vectors and the box
+faces are parallelograms.  In each dimension box faces can be periodic,
+or non-periodic with fixed or shrink-wrapped boundaries.  In the fixed
+case, atoms which move outside the face are deleted; shrink-wrapped
+means the position of the box face adjusts continuously to enclose all
+the atoms.
+
+For distributed-memory MPI parallelism, the simulation box is spatially
+decomposed (partitioned) into non-overlapping sub-domains which fill the
+box. The default partitioning, "brick", is most suitable when atom
+density is roughly uniform, as shown in the left-side images of the
+:ref:`domain-decomposition` figure.  The sub-domains comprise a regular
+grid and all sub-domains are identical in size and shape.  Both the
+orthogonal and triclinic boxes can deform continuously during a
+simulation, e.g. to compress a solid or shear a liquid, in which case
+the processor sub-domains likewise deform.
+
+
+For models with non-uniform density, the number of particles per
+processor can be load-imbalanced with the default partitioning.  This
+reduces parallel efficiency, as the overall simulation rate is limited
+by the slowest processor, i.e. the one with the largest computational
+load.  For such models, LAMMPS supports multiple strategies to reduce
+the load imbalance:
+
+- The processor grid decomposition is by default based on the simulation
+  cell volume and tries to optimize the volume to surface ratio for the sub-domains.
+  This can be changed with the :doc:`processors command <processors>`.
+- The parallel planes defining the size of the sub-domains can be shifted
+  with the :doc:`balance command <balance>`.  This can be done in addition
+  to choosing a more optimal processor grid.
+- The recursive bisectioning algorithm in combination with the "tiled"
+  communication style can produce a partitioning with equal numbers of
+  particles in each sub-domain.
+
+
+.. |decomp1| image:: img/decomp-regular.png
+   :width: 24%
+
+.. |decomp2| image:: img/decomp-processors.png
+   :width: 24%
+
+.. |decomp3| image:: img/decomp-balance.png
+   :width: 24%
+
+.. |decomp4| image:: img/decomp-rcb.png
+   :width: 24%
+
+|decomp1|  |decomp2|  |decomp3|  |decomp4|
+
+The pictures above demonstrate different decompositions for a 2d system
+with 12 MPI ranks.  The atom colors indicate the load imbalance of each
+sub-domain with green being optimal and red the least optimal.
+
+Due to the vacuum in the system, the default decomposition is unbalanced
+with several MPI ranks without atoms (left). By forcing a 1x12x1
+processor grid, every MPI rank does computations now, but the number of
+atoms per sub-domain is still uneven and the thin slice shape increases
+the amount of communication between sub-domains (center left). With a
+2x6x1 processor grid and shifting the sub-domain divisions, the load
+imbalance is further reduced and the amount of communication required
+between sub-domains is less (center right).  And using the recursive
+bisectioning leads to a further improved decomposition (right).
diff --git a/doc/src/Developer_parallel.rst b/doc/src/Developer_parallel.rst
new file mode 100644
index 0000000000..c7bfcfca9e
--- /dev/null
+++ b/doc/src/Developer_parallel.rst
@@ -0,0 +1,28 @@
+Parallel algorithms
+-------------------
+
+LAMMPS is designed to enable running simulations in parallel using the
+MPI parallel communication standard with distributed data via domain
+decomposition.  The parallelization aims to be efficient and result in good
+strong scaling (= good speedup for the same system) and good weak
+scaling (= the computational cost of enlarging the system is
+proportional to the system size).  Additional parallelization using GPUs
+or OpenMP can also be applied within the sub-domain assigned to an MPI
+process.  For clarity, most of the following illustrations show the 2d
+simulation case. The underlying algorithms in those cases, however,
+apply to both 2d and 3d cases equally well.
+
+.. note::
+
+   The text and most of the figures in this chapter were adapted
+   for the manual from the section on parallel algorithms in the
+   :ref:`new LAMMPS paper <lammps_paper>`.
+
+.. toctree::
+   :maxdepth: 1
+
+   Developer_par_part
+   Developer_par_comm
+   Developer_par_neigh
+   Developer_par_long
+   Developer_par_openmp
diff --git a/doc/src/Intro_citing.rst b/doc/src/Intro_citing.rst
index 978def9f15..0e10b7559a 100644
--- a/doc/src/Intro_citing.rst
+++ b/doc/src/Intro_citing.rst
@@ -4,28 +4,41 @@ Citing LAMMPS
 Core Algorithms
 ^^^^^^^^^^^^^^^
 
-Since LAMMPS is a community project, there is not a single one
-publication or reference that describes **all** of LAMMPS.
-The canonical publication that describes the foundation, that is
-the basic spatial decomposition approach, the neighbor finding,
-and basic communications algorithms used in LAMMPS is:
+The paper mentioned below is the best overview of LAMMPS, but there are
+also publications describing particular models or algorithms implemented
+in LAMMPS or complementary software that it has interfaces to.  Please
+see below for how to cite contributions to LAMMPS.
 
- `S. Plimpton, Fast Parallel Algorithms for Short-Range Molecular Dynamics, J Comp Phys, 117, 1-19 (1995). <http://www.sandia.gov/~sjplimp/papers/jcompphys95.pdf>`_
+.. _lammps_paper:
 
-So any project using LAMMPS (or a derivative application using LAMMPS as
-a simulation engine) should cite this paper. A new publication
-describing the developments and improvements of LAMMPS in the 25 years
-since then is currently in preparation.
+The latest canonical publication that describes the basic features, the
+source code design, the program structure, the spatial decomposition
+approach, the neighbor finding, basic communications algorithms, and how
+users and developers have contributed to LAMMPS is:
+
+  `LAMMPS - A flexible simulation tool for particle-based materials modeling at the atomic, meso, and continuum scales, Comp. Phys. Comm. (accepted 09/2021), DOI:10.1016/j.cpc.2021.108171 <https://doi.org/10.1016/j.cpc.2021.108171>`_
+
+So a project using LAMMPS or a derivative application that uses LAMMPS
+as a simulation engine should cite this paper.  The paper is expected to
+be published in its final form under the same DOI in the first half
+of 2022.  Please also give the URL of the LAMMPS website in your paper,
+namely https://www.lammps.org.
+
+The original publication describing the parallel algorithms used in the
+initial versions of LAMMPS is:
+
+  `S. Plimpton, Fast Parallel Algorithms for Short-Range Molecular Dynamics, J Comp Phys, 117, 1-19 (1995). <http://www.sandia.gov/~sjplimp/papers/jcompphys95.pdf>`_
 
 
 DOI for the LAMMPS code
 ^^^^^^^^^^^^^^^^^^^^^^^
 
-LAMMPS developers use the `Zenodo service at CERN
-<https://zenodo.org/>`_ to create digital object identifies (DOI) for
-stable releases of the LAMMPS code. There are two types of DOIs for the
-LAMMPS source code: the canonical DOI for **all** versions of LAMMPS,
-which will always point to the **latest** stable release version is:
+LAMMPS developers use the `Zenodo service at CERN <https://zenodo.org/>`_
+to create digital object identifiers (DOI) for stable releases of the
+LAMMPS source code. There are two types of DOIs for the LAMMPS source code.
+
+The canonical DOI for **all** versions of LAMMPS, which will always
+point to the **latest** stable release version is:
 
 - DOI: `10.5281/zenodo.3726416 <https://dx.doi.org/10.5281/zenodo.3726416>`_
 
@@ -45,11 +58,13 @@ about LAMMPS and its features.
 Citing contributions
 ^^^^^^^^^^^^^^^^^^^^
 
-LAMMPS has many features and that use either previously published
-methods and algorithms or novel features.  It also includes potential
-parameter filed for specific models.  Where available, a reminder about
-references for optional features used in a specific run is printed to
-the screen and log file.  Style and output location can be selected with
-the :ref:`-cite command-line switch <cite>`.  Additional references are
+LAMMPS has many features that use either previously published methods
+and algorithms or novel features.  It also includes potential parameter
+files for specific models.  Where available, a reminder about references
+for optional features used in a specific run is printed to the screen
+and log file.  Style and output location can be selected with the
+:ref:`-cite command-line switch <cite>`.  Additional references are
 given in the documentation of the :doc:`corresponding commands
-<Commands_all>` or in the :doc:`Howto tutorials <Howto>`.
+<Commands_all>` or in the :doc:`Howto tutorials <Howto>`.  So please
+make certain that you provide the proper acknowledgments and citations
+in any published works using LAMMPS.
diff --git a/doc/src/Library_create.rst b/doc/src/Library_create.rst
index 3566cb3cc9..8043819891 100644
--- a/doc/src/Library_create.rst
+++ b/doc/src/Library_create.rst
@@ -34,7 +34,7 @@ simple example demonstrating its use:
      int lmpargc = sizeof(lmpargv)/sizeof(const char *);
 
      /* create LAMMPS instance */
-     handle = lammps_open_no_mpi(lmpargc, lmpargv, NULL);
+     handle = lammps_open_no_mpi(lmpargc, (char **)lmpargv, NULL);
      if (handle == NULL) {
        printf("LAMMPS initialization failed");
        lammps_mpi_finalize();
diff --git a/doc/src/PDF/colvars-refman-lammps.pdf b/doc/src/PDF/colvars-refman-lammps.pdf
index 8b6e5bffdc..011b3d0f04 100644
Binary files a/doc/src/PDF/colvars-refman-lammps.pdf and b/doc/src/PDF/colvars-refman-lammps.pdf differ
diff --git a/doc/src/Run_basics.rst b/doc/src/Run_basics.rst
index 3d57633df2..5f1211d093 100644
--- a/doc/src/Run_basics.rst
+++ b/doc/src/Run_basics.rst
@@ -2,17 +2,25 @@ Basics of running LAMMPS
 ========================
 
 LAMMPS is run from the command line, reading commands from a file via
-the -in command line flag, or from standard input.
-Using the "-in in.file" variant is recommended:
+the -in command line flag, or from standard input.  Using the "-in
+in.file" variant is recommended (see note below).  The name of the
+LAMMPS executable is either ``lmp`` or ``lmp_<machine>`` with
+``<machine>`` being the machine string used when compiling LAMMPS.  This
+is required when compiling LAMMPS with the traditional build system
+(e.g. with ``make mpi``), but optional when using CMake to configure and
+build LAMMPS:
 
 .. code-block:: bash
 
    $ lmp_serial -in in.file
    $ lmp_serial < in.file
+   $ lmp -in in.file
+   $ lmp < in.file
    $ /path/to/lammps/src/lmp_serial -i in.file
    $ mpirun -np 4 lmp_mpi -in in.file
+   $ mpiexec -np 4 lmp -in in.file
    $ mpirun -np 8 /path/to/lammps/src/lmp_mpi -in in.file
-   $ mpirun -np 6 /usr/local/bin/lmp -in in.file
+   $ mpiexec -n 6 /usr/local/bin/lmp -in in.file
 
 You normally run the LAMMPS command in the directory where your input
 script is located.  That is also where output files are produced by
@@ -23,7 +31,7 @@ executable itself can be placed elsewhere.
 .. note::
 
    The redirection operator "<" will not always work when running
-   in parallel with mpirun; for those systems the -in form is required.
+   in parallel with mpirun or mpiexec; for those systems the -in form is required.
 
 As LAMMPS runs it prints info to the screen and a logfile named
 *log.lammps*\ .  More info about output is given on the
diff --git a/doc/src/group.rst b/doc/src/group.rst
index e72eeb7c19..36559ba68a 100644
--- a/doc/src/group.rst
+++ b/doc/src/group.rst
@@ -38,7 +38,7 @@ Syntax
        *intersect* args = two or more group IDs
        *dynamic* args = parent-ID keyword value ...
          one or more keyword/value pairs may be appended
-         keyword = *region* or *var* or *every*
+         keyword = *region* or *var* or *property* or *every*
            *region* value = region-ID
            *var* value = name of variable
            *property* value = name of custom integer or floating point vector
diff --git a/doc/src/img/decomp-balance.png b/doc/src/img/decomp-balance.png
new file mode 100644
index 0000000000..eb00e8e89a
Binary files /dev/null and b/doc/src/img/decomp-balance.png differ
diff --git a/doc/src/img/decomp-processors.png b/doc/src/img/decomp-processors.png
new file mode 100644
index 0000000000..0d68f3679f
Binary files /dev/null and b/doc/src/img/decomp-processors.png differ
diff --git a/doc/src/img/decomp-rcb.png b/doc/src/img/decomp-rcb.png
new file mode 100644
index 0000000000..0e38efa7ea
Binary files /dev/null and b/doc/src/img/decomp-rcb.png differ
diff --git a/doc/src/img/decomp-regular.png b/doc/src/img/decomp-regular.png
new file mode 100644
index 0000000000..a8c645033d
Binary files /dev/null and b/doc/src/img/decomp-regular.png differ
diff --git a/doc/src/img/domain-decomp.png b/doc/src/img/domain-decomp.png
new file mode 100644
index 0000000000..a0a5cc06f2
Binary files /dev/null and b/doc/src/img/domain-decomp.png differ
diff --git a/doc/src/img/fft-decomp-parallel.png b/doc/src/img/fft-decomp-parallel.png
new file mode 100644
index 0000000000..80f69bd033
Binary files /dev/null and b/doc/src/img/fft-decomp-parallel.png differ
diff --git a/doc/src/img/ghost-comm.png b/doc/src/img/ghost-comm.png
new file mode 100644
index 0000000000..a402daa054
Binary files /dev/null and b/doc/src/img/ghost-comm.png differ
diff --git a/doc/src/img/neigh-stencil.png b/doc/src/img/neigh-stencil.png
new file mode 100644
index 0000000000..7d06f6ae14
Binary files /dev/null and b/doc/src/img/neigh-stencil.png differ
diff --git a/doc/utils/requirements.txt b/doc/utils/requirements.txt
index 7e4563a1ec..f367727d20 100644
--- a/doc/utils/requirements.txt
+++ b/doc/utils/requirements.txt
@@ -1,7 +1,7 @@
 Sphinx==4.0.3
-sphinxcontrib-spelling
+sphinxcontrib-spelling==7.2.1
 git+git://github.com/akohlmey/sphinx-fortran@parallel-read
-sphinx_tabs
-breathe
-Pygments
-six
+sphinx_tabs==3.2.0
+breathe==4.31.0
+Pygments==2.10.0
+six==1.16.0
diff --git a/examples/PACKAGES/charge_regulation/in.chreg-polymer b/examples/PACKAGES/charge_regulation/in.chreg-polymer
index 0adab9b5e7..055032c018 100644
--- a/examples/PACKAGES/charge_regulation/in.chreg-polymer
+++ b/examples/PACKAGES/charge_regulation/in.chreg-polymer
@@ -8,7 +8,7 @@ bond_style      harmonic
 bond_coeff      1 100 1.122462 # K R0
 velocity        all create 1.0 8008 loop geom
 
-pair_style      lj/cut/coul/long 1.122462 20
+pair_style      lj/cut/coul/long/soft 2 0.5 10.0  1.122462 20
 pair_coeff      * *  1.0 1.0 1.122462 # charges
 kspace_style    pppm 1.0e-3
 pair_modify     shift yes
diff --git a/lib/colvars/colvarmodule.cpp b/lib/colvars/colvarmodule.cpp
index 405c68244b..ee14703726 100644
--- a/lib/colvars/colvarmodule.cpp
+++ b/lib/colvars/colvarmodule.cpp
@@ -1476,7 +1476,9 @@ int colvarmodule::write_output_files()
        bi != biases.end();
        bi++) {
     // Only write output files if they have not already been written this time step
-    if ((*bi)->output_freq == 0 || (cvm::step_absolute() % (*bi)->output_freq) != 0) {
+    if ((*bi)->output_freq == 0    ||
+        cvm::step_relative() == 0  ||
+        (cvm::step_absolute() % (*bi)->output_freq) != 0) {
       error_code |= (*bi)->write_output_files();
     }
     error_code |= (*bi)->write_state_to_replicas();
diff --git a/lib/colvars/colvars_version.h b/lib/colvars/colvars_version.h
index dd56c39f3a..3f050c7e7c 100644
--- a/lib/colvars/colvars_version.h
+++ b/lib/colvars/colvars_version.h
@@ -1,3 +1,3 @@
 #ifndef COLVARS_VERSION
-#define COLVARS_VERSION "2021-08-06"
+#define COLVARS_VERSION "2021-09-21"
 #endif
diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h
index e5efc239da..003b4b3ba7 100644
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@@ -556,16 +556,22 @@ void UCL_Device::add_properties(cl_device_id device_list) {
                                sizeof(float_width),&float_width,nullptr));
   op.preferred_vector_width32=float_width;
 
-  // Determine if double precision is supported
   cl_uint double_width;
   CL_SAFE_CALL(clGetDeviceInfo(device_list,
                                CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
                                sizeof(double_width),&double_width,nullptr));
   op.preferred_vector_width64=double_width;
-  if (double_width==0)
-    op.double_precision=false;
-  else
+
+  // Determine if double precision is supported: All bits in the mask must be set.
+  cl_device_fp_config double_mask = (CL_FP_FMA|CL_FP_ROUND_TO_NEAREST|CL_FP_ROUND_TO_ZERO|
+                                     CL_FP_ROUND_TO_INF|CL_FP_INF_NAN|CL_FP_DENORM);
+  cl_device_fp_config double_avail;
+  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_DOUBLE_FP_CONFIG,
+                               sizeof(double_avail),&double_avail,nullptr));
+  if ((double_avail & double_mask) == double_mask)
     op.double_precision=true;
+  else
+    op.double_precision=false;
 
   CL_SAFE_CALL(clGetDeviceInfo(device_list,
                                CL_DEVICE_PROFILING_TIMER_RESOLUTION,
diff --git a/lib/gpu/lal_born_coul_long.cpp b/lib/gpu/lal_born_coul_long.cpp
index 8c7084f4a4..36f46f2684 100644
--- a/lib/gpu/lal_born_coul_long.cpp
+++ b/lib/gpu/lal_born_coul_long.cpp
@@ -34,7 +34,7 @@ BornCoulLongT::BornCoulLong() : BaseCharge<numtyp,acctyp>(),
 }
 
 template <class numtyp, class acctyp>
-BornCoulLongT::~BornCoulLongT() {
+BornCoulLongT::~BornCoulLong() {
   clear();
 }
 
diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp
index 9aac866353..9a0653dd23 100644
--- a/lib/gpu/lal_born_coul_wolf.cpp
+++ b/lib/gpu/lal_born_coul_wolf.cpp
@@ -34,7 +34,7 @@ BornCoulWolfT::BornCoulWolf() : BaseCharge<numtyp,acctyp>(),
 }
 
 template <class numtyp, class acctyp>
-BornCoulWolfT::~BornCoulWolfT() {
+BornCoulWolfT::~BornCoulWolf() {
   clear();
 }
 
diff --git a/lib/gpu/lal_buck_coul_long.cpp b/lib/gpu/lal_buck_coul_long.cpp
index 60205a2ad6..98c97ea908 100644
--- a/lib/gpu/lal_buck_coul_long.cpp
+++ b/lib/gpu/lal_buck_coul_long.cpp
@@ -34,7 +34,7 @@ BuckCoulLongT::BuckCoulLong() : BaseCharge<numtyp,acctyp>(),
 }
 
 template <class numtyp, class acctyp>
-BuckCoulLongT::~BuckCoulLongT() {
+BuckCoulLongT::~BuckCoulLong() {
   clear();
 }
 
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 9dbd02dd3e..59eac78483 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -333,6 +333,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
     gpu_barrier();
   }
 
+  // return error -16 if this build needs double precision but the device lacks it
+  #if defined(_SINGLE_DOUBLE) || defined(_DOUBLE_DOUBLE)
+  if (!gpu->double_precision())
+    return -16;
+  #endif
+
   // Setup auto bin size calculation for calls from atom::sort
   // - This is repeated in neighbor init with additional info
   if (_user_cell_size<0.0) {
@@ -546,14 +552,9 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
     return -3;
 
   if (_user_cell_size<0.0) {
-    #ifndef LAL_USE_OLD_NEIGHBOR
-    _neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size());
-    #else
     _neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size());
-    #endif
   } else
-    _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,
-                                          nbor->simd_size());
+    _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,nbor->simd_size());
   nbor->set_cutoff(cutoff);
 
   return 0;
diff --git a/lib/pace/Makefile b/lib/pace/Makefile
index ac9f3a3151..c2e1892ddd 100644
--- a/lib/pace/Makefile
+++ b/lib/pace/Makefile
@@ -2,8 +2,8 @@ SHELL = /bin/sh
 
 # ------ FILES ------
 
-SRC_FILES = $(wildcard src/ML-PACE/*.cpp)
-SRC = $(filter-out src/ML-PACE/pair_pace.cpp, $(SRC_FILES))
+SRC_FILES = $(wildcard src/USER-PACE/*.cpp)
+SRC = $(filter-out src/USER-PACE/pair_pace.cpp, $(SRC_FILES))
 
 # ------ DEFINITIONS ------
 
@@ -12,7 +12,7 @@ OBJ =   $(SRC:.cpp=.o)
 
 
 # ------ SETTINGS ------
-CXXFLAGS = -O3 -fPIC -Isrc/ML-PACE
+CXXFLAGS = -O3 -fPIC -Isrc/USER-PACE
 
 ARCHIVE =	ar
 ARCHFLAG =	-rc
diff --git a/lib/pace/Makefile.lammps b/lib/pace/Makefile.lammps
index 89761c1b4b..17820716df 100644
--- a/lib/pace/Makefile.lammps
+++ b/lib/pace/Makefile.lammps
@@ -1,3 +1,3 @@
-pace_SYSINC =-I../../lib/pace/src/ML-PACE
+pace_SYSINC =-I../../lib/pace/src/USER-PACE
 pace_SYSLIB = -L../../lib/pace/ -lpace
 pace_SYSPATH =
diff --git a/src/.gitignore b/src/.gitignore
index 6c0a838c1b..174ee35be5 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -858,8 +858,6 @@
 /fix_ti_rs.h
 /fix_ti_spring.cpp
 /fix_ti_spring.h
-/fix_ttm.cpp
-/fix_ttm.h
 /fix_tune_kspace.cpp
 /fix_tune_kspace.h
 /fix_wall_body_polygon.cpp
@@ -919,6 +917,7 @@
 /improper_ring.h
 /improper_umbrella.cpp
 /improper_umbrella.h
+/interlayer_taper.h
 /kissfft.h
 /lj_sdk_common.h
 /math_complex.h
@@ -933,7 +932,6 @@
 /msm_cg.h
 /neb.cpp
 /neb.h
-
 /pair_adp.cpp
 /pair_adp.h
 /pair_agni.cpp
@@ -994,6 +992,8 @@
 /pair_cosine_squared.h
 /pair_coul_diel.cpp
 /pair_coul_diel.h
+/pair_coul_exclude.cpp
+/pair_coul_exclude.h
 /pair_coul_long.cpp
 /pair_coul_long.h
 /pair_coul_msm.cpp
@@ -1431,6 +1431,10 @@
 /fix_srp.h
 /fix_tfmc.cpp
 /fix_tfmc.h
+/fix_ttm.cpp
+/fix_ttm.h
+/fix_ttm_grid.cpp
+/fix_ttm_grid.h
 /fix_ttm_mod.cpp
 /fix_ttm_mod.h
 /pair_born_coul_long_cs.cpp
diff --git a/src/DRUDE/fix_drude_transform.cpp b/src/DRUDE/fix_drude_transform.cpp
index ed42d2b548..3f8b0cfe26 100644
--- a/src/DRUDE/fix_drude_transform.cpp
+++ b/src/DRUDE/fix_drude_transform.cpp
@@ -13,16 +13,18 @@
 ------------------------------------------------------------------------- */
 
 /** Fix Drude Transform ******************************************************/
+
 #include "fix_drude_transform.h"
 
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+#include "fix_drude.h"
+#include "modify.h"
+
 #include <cmath>
 #include <cstring>
-#include "fix_drude.h"
-#include "atom.h"
-#include "domain.h"
-#include "comm.h"
-#include "error.h"
-#include "modify.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
diff --git a/src/DRUDE/fix_drude_transform.h b/src/DRUDE/fix_drude_transform.h
index 7ee85d2b42..495ec8b175 100644
--- a/src/DRUDE/fix_drude_transform.h
+++ b/src/DRUDE/fix_drude_transform.h
@@ -25,10 +25,10 @@ FixStyle(drude/transform/inverse,FixDrudeTransform<true>);
 
 namespace LAMMPS_NS {
 
-template <bool inverse> class FixDrudeTransform : public Fix {
+template <bool inverse> class FixDrudeTransform: public Fix {
  public:
-  FixDrudeTransform<inverse>(class LAMMPS *, int, char **);
-  ~FixDrudeTransform<inverse>();
+  FixDrudeTransform(class LAMMPS *, int, char **);
+  ~FixDrudeTransform();
   int setmask();
   void init();
   void setup(int vflag);
diff --git a/src/EXTRA-FIX/fix_npt_cauchy.cpp b/src/EXTRA-FIX/fix_npt_cauchy.cpp
index 07077cdabb..d9a0f850db 100644
--- a/src/EXTRA-FIX/fix_npt_cauchy.cpp
+++ b/src/EXTRA-FIX/fix_npt_cauchy.cpp
@@ -2442,7 +2442,7 @@ double FixNPTCauchy::memory_usage()
 void FixNPTCauchy::CauchyStat_init()
 {
   if (comm->me == 0) {
-    std::string mesg = fmt::format("Using fix npt/cauchy with alpha={:f.8}\n",alpha);
+    std::string mesg = fmt::format("Using fix npt/cauchy with alpha={:.8f}\n",alpha);
     if (restartPK==1) {
       mesg += "   (this is a continuation run)\n";
     } else {
@@ -2463,7 +2463,7 @@ void FixNPTCauchy::CauchyStat_init()
     error->all(FLERR,"Illegal fix npt/cauchy command: Alpha cannot be zero or negative.");
 
   if (restart_stored < 0) {
-    modify->add_fix(std::string(id_store) + "all STORE global 1 6");
+    modify->add_fix(std::string(id_store) + " all STORE global 1 6");
     restart_stored = modify->find_fix(id_store);
   }
   init_store = (FixStore *)modify->fix[restart_stored];
diff --git a/src/EXTRA-PAIR/pair_coul_exclude.cpp b/src/EXTRA-PAIR/pair_coul_exclude.cpp
index 404fc9c784..74890bcf08 100644
--- a/src/EXTRA-PAIR/pair_coul_exclude.cpp
+++ b/src/EXTRA-PAIR/pair_coul_exclude.cpp
@@ -189,7 +189,7 @@ void PairCoulExclude::init_style()
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
-double PairCoulExclude::init_one(int i, int j)
+double PairCoulExclude::init_one(int /*i*/, int /*j*/)
 {
   return cut_global;
 }
diff --git a/src/GPU/gpu_extra.h b/src/GPU/gpu_extra.h
index bd55d11021..c2d9931e0a 100644
--- a/src/GPU/gpu_extra.h
+++ b/src/GPU/gpu_extra.h
@@ -78,7 +78,11 @@ inline void check_flag(int error_flag, LAMMPS_NS::Error *error, MPI_Comm &world)
     else if (all_success == -13)
       error->all(FLERR, "Invalid device configuration.");
     else if (all_success == -15)
-      error->all(FLERR, "P3M built for FP64 and GPU device is FP32 only.");
+      error->all(FLERR, "PPPM was compiled for double precision floating point "
+                 "but GPU device supports single precision only.");
+    else if (all_success == -16)
+      error->all(FLERR, "GPU library was compiled for double or mixed precision "
+                 "floating point but GPU device supports single precision only.");
     else
       error->all(FLERR, "Unknown error in GPU library");
   }
diff --git a/src/H5MD/dump_h5md.cpp b/src/H5MD/dump_h5md.cpp
index 43219fb035..bc9c98caa0 100644
--- a/src/H5MD/dump_h5md.cpp
+++ b/src/H5MD/dump_h5md.cpp
@@ -28,12 +28,12 @@
 #include "update.h"
 #include "version.h"
 
+#include "ch5md.h"
+
 #include <climits>
 #include <cmath>
 #include <cstring>
 
-#include "ch5md.h"
-
 using namespace LAMMPS_NS;
 
 #define MYMIN(a,b) ((a) < (b) ? (a) : (b))
diff --git a/src/KIM/kim_init.cpp b/src/KIM/kim_init.cpp
index ea3dc46da4..a4003b7510 100644
--- a/src/KIM/kim_init.cpp
+++ b/src/KIM/kim_init.cpp
@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -85,21 +84,23 @@ void KimInit::command(int narg, char **arg)
   if ((narg < 2) || (narg > 3)) error->all(FLERR, "Illegal 'kim init' command");
 
   if (domain->box_exist)
-    error->all(FLERR, "Must use 'kim init' command before "
-                      "simulation box is defined");
+    error->all(FLERR, "Must use 'kim init' command before simulation box is defined");
 
   char *model_name = utils::strdup(arg[0]);
   char *user_units = utils::strdup(arg[1]);
   if (narg == 3) {
     auto arg_str = std::string(arg[2]);
-    if (arg_str == "unit_conversion_mode") unit_conversion_mode = true;
+    if (arg_str == "unit_conversion_mode")
+      unit_conversion_mode = true;
     else {
-      error->all(FLERR, "Illegal 'kim init' command.\nThe argument "
-                                    "followed by unit_style {} is an optional "
-                                    "argument and when is used must "
-                                    "be unit_conversion_mode", user_units);
+      error->all(FLERR,
+                 "Illegal 'kim init' command.\n"
+                 "The argument followed by unit_style {} is an optional argument and when "
+                 "is used must be unit_conversion_mode",
+                 user_units);
     }
-  } else unit_conversion_mode = false;
+  } else
+    unit_conversion_mode = false;
 
   char *model_units;
   KIM_Model *pkim = nullptr;
@@ -117,14 +118,9 @@ void KimInit::command(int narg, char **arg)
 /* ---------------------------------------------------------------------- */
 
 namespace {
-void get_kim_unit_names(
-    char const * const system,
-    KIM_LengthUnit & lengthUnit,
-    KIM_EnergyUnit & energyUnit,
-    KIM_ChargeUnit & chargeUnit,
-    KIM_TemperatureUnit & temperatureUnit,
-    KIM_TimeUnit & timeUnit,
-    Error * error)
+void get_kim_unit_names(char const *const system, KIM_LengthUnit &lengthUnit,
+                        KIM_EnergyUnit &energyUnit, KIM_ChargeUnit &chargeUnit,
+                        KIM_TemperatureUnit &temperatureUnit, KIM_TimeUnit &timeUnit, Error *error)
 {
   const std::string system_str(system);
   if (system_str == "real") {
@@ -157,20 +153,64 @@ void get_kim_unit_names(
     chargeUnit = KIM_CHARGE_UNIT_e;
     temperatureUnit = KIM_TEMPERATURE_UNIT_K;
     timeUnit = KIM_TIME_UNIT_fs;
-  } else if ((system_str == "lj") ||
-             (system_str == "micro") ||
-             (system_str == "nano")) {
-    error->all(FLERR, "LAMMPS unit_style {} not supported "
-                                  "by KIM models", system_str);
+  } else if ((system_str == "lj") || (system_str == "micro") || (system_str == "nano")) {
+    error->all(FLERR, "LAMMPS unit_style {} not supported by KIM models", system_str);
   } else {
     error->all(FLERR, "Unknown unit_style");
   }
 }
-}  // namespace
+}    // namespace
 
-void KimInit::determine_model_type_and_units(char * model_name,
-                                             char * user_units,
-                                             char ** model_units,
+// Echo the directories in which the KIM API looks for 'Portable Models' and
+// 'Simulator Models'. One section is written per item type and the directory
+// numbering restarts at 1 in every section.
+//   collections: KIM collections handle that all directory queries are made on
+// The whole report is assembled first and emitted with a single write_echo() call.
+void KimInit::print_dirs(struct KIM_Collections *const collections) const
+{
+  // collections are visited in this fixed order for each item type
+  const std::vector<struct KIM_Collection> collection_list = {
+      KIM_COLLECTION_currentWorkingDirectory, KIM_COLLECTION_environmentVariable,
+      KIM_COLLECTION_user, KIM_COLLECTION_system};
+
+  // item types to report together with the label shown in the section header
+  struct ItemKind {
+    KIM_CollectionItemType type;
+    const char *label;
+  };
+  const ItemKind item_kinds[] = {{KIM_COLLECTION_ITEM_TYPE_portableModel, "Portable Models"},
+                                 {KIM_COLLECTION_ITEM_TYPE_simulatorModel, "Simulator Models"}};
+
+  std::string mesg;
+  for (const auto &kind : item_kinds) {
+    mesg += fmt::format("#=== KIM is looking for '{}' in these directories ===\n", kind.label);
+
+    int dirCounter = 0;    // numbering restarts for every item type
+    for (const auto &col : collection_list) {
+      int dirListExtent = 0;
+      int kim_error = KIM_Collections_CacheListOfDirectoryNames(collections, col, kind.type,
+                                                                &dirListExtent);
+      if (kim_error) continue;    // nothing to list for this collection
+
+      for (int i = 0; i < dirListExtent; ++i) {
+        // start from nullptr so that a lookup which leaves 'name' untouched can
+        // never hand an indeterminate pointer to fmt::format() below
+        char const *name = nullptr;
+        kim_error = KIM_Collections_GetDirectoryName(collections, i, &name);
+        // Don't check for error due to bug in kim-api-2.2.1 and below.
+#if ((KIM_VERSION_MAJOR * 1000 + KIM_VERSION_MINOR) * 1000 + KIM_VERSION_PATCH) <= 2002001
+        kim_error = 0;
+#endif
+        if (!kim_error && name) mesg += fmt::format("# {:2}: {}\n", ++dirCounter, name);
+      }
+    }
+  }
+
+  // emit the assembled report
+  input->write_echo(mesg);
+}
+
+void KimInit::determine_model_type_and_units(char *model_name, char *user_units, char **model_units,
                                              KIM_Model *&pkim)
 {
   KIM_LengthUnit lengthUnit;
@@ -179,33 +219,26 @@ void KimInit::determine_model_type_and_units(char * model_name,
   KIM_TemperatureUnit temperatureUnit;
   KIM_TimeUnit timeUnit;
   int units_accepted;
-  KIM_Collections * collections;
+  KIM_Collections *collections;
   KIM_CollectionItemType itemType;
 
   int kim_error = KIM_Collections_Create(&collections);
-  if (kim_error)
-    error->all(FLERR, "Unable to access KIM Collections to find Model");
+  if (kim_error) error->all(FLERR, "Unable to access KIM Collections to find Model");
 
   auto logID = fmt::format("{}_Collections", comm->me);
   KIM_Collections_SetLogID(collections, logID.c_str());
 
+  print_dirs(collections);
+
   kim_error = KIM_Collections_GetItemType(collections, model_name, &itemType);
   if (kim_error) error->all(FLERR, "KIM Model name not found");
   KIM_Collections_Destroy(&collections);
 
-  if (KIM_CollectionItemType_Equal(itemType,
-                                   KIM_COLLECTION_ITEM_TYPE_portableModel)) {
-    get_kim_unit_names(user_units, lengthUnit, energyUnit,
-                       chargeUnit, temperatureUnit, timeUnit, error);
-    int kim_error = KIM_Model_Create(KIM_NUMBERING_zeroBased,
-                                     lengthUnit,
-                                     energyUnit,
-                                     chargeUnit,
-                                     temperatureUnit,
-                                     timeUnit,
-                                     model_name,
-                                     &units_accepted,
-                                     &pkim);
+  if (KIM_CollectionItemType_Equal(itemType, KIM_COLLECTION_ITEM_TYPE_portableModel)) {
+    get_kim_unit_names(user_units, lengthUnit, energyUnit, chargeUnit, temperatureUnit, timeUnit,
+                       error);
+    int kim_error = KIM_Model_Create(KIM_NUMBERING_zeroBased, lengthUnit, energyUnit, chargeUnit,
+                                     temperatureUnit, timeUnit, model_name, &units_accepted, &pkim);
 
     if (kim_error) error->all(FLERR, "Unable to load KIM Simulator Model");
 
@@ -219,20 +252,12 @@ void KimInit::determine_model_type_and_units(char * model_name,
     } else if (unit_conversion_mode) {
       KIM_Model_Destroy(&pkim);
       int const num_systems = 5;
-      char const * const systems[num_systems]
-          = {"metal", "real", "si", "cgs", "electron"};
-      for (int i=0; i < num_systems; ++i) {
-        get_kim_unit_names(systems[i], lengthUnit, energyUnit,
-                           chargeUnit, temperatureUnit, timeUnit, error);
-        kim_error = KIM_Model_Create(KIM_NUMBERING_zeroBased,
-                                     lengthUnit,
-                                     energyUnit,
-                                     chargeUnit,
-                                     temperatureUnit,
-                                     timeUnit,
-                                     model_name,
-                                     &units_accepted,
-                                     &pkim);
+      char const *const systems[num_systems] = {"metal", "real", "si", "cgs", "electron"};
+      for (int i = 0; i < num_systems; ++i) {
+        get_kim_unit_names(systems[i], lengthUnit, energyUnit, chargeUnit, temperatureUnit,
+                           timeUnit, error);
+        kim_error = KIM_Model_Create(KIM_NUMBERING_zeroBased, lengthUnit, energyUnit, chargeUnit,
+                                     temperatureUnit, timeUnit, model_name, &units_accepted, &pkim);
         if (units_accepted) {
           logID = fmt::format("{}_Model", comm->me);
           KIM_Model_SetLogID(pkim, logID.c_str());
@@ -246,12 +271,10 @@ void KimInit::determine_model_type_and_units(char * model_name,
       KIM_Model_Destroy(&pkim);
       error->all(FLERR, "KIM Model does not support the requested unit system");
     }
-  } else if (KIM_CollectionItemType_Equal(
-             itemType, KIM_COLLECTION_ITEM_TYPE_simulatorModel)) {
-    KIM_SimulatorModel * simulatorModel;
+  } else if (KIM_CollectionItemType_Equal(itemType, KIM_COLLECTION_ITEM_TYPE_simulatorModel)) {
+    KIM_SimulatorModel *simulatorModel;
     kim_error = KIM_SimulatorModel_Create(model_name, &simulatorModel);
-    if (kim_error)
-      error->all(FLERR, "Unable to load KIM Simulator Model");
+    if (kim_error) error->all(FLERR, "Unable to load KIM Simulator Model");
     model_type = SM;
 
     logID = fmt::format("{}_SimulatorModel", comm->me);
@@ -264,13 +287,11 @@ void KimInit::determine_model_type_and_units(char * model_name,
     KIM_SimulatorModel_GetNumberOfSimulatorFields(simulatorModel, &sim_fields);
     KIM_SimulatorModel_CloseTemplateMap(simulatorModel);
     for (int i = 0; i < sim_fields; ++i) {
-      KIM_SimulatorModel_GetSimulatorFieldMetadata(
-        simulatorModel, i, &sim_lines, &sim_field);
+      KIM_SimulatorModel_GetSimulatorFieldMetadata(simulatorModel, i, &sim_lines, &sim_field);
 
       const std::string sim_field_str(sim_field);
       if (sim_field_str == "units") {
-        KIM_SimulatorModel_GetSimulatorFieldLine(
-          simulatorModel, i, 0, &sim_value);
+        KIM_SimulatorModel_GetSimulatorFieldLine(simulatorModel, i, 0, &sim_value);
         *model_units = utils::strdup(sim_value);
         break;
       }
@@ -280,16 +301,15 @@ void KimInit::determine_model_type_and_units(char * model_name,
     const std::string model_units_str(*model_units);
     const std::string user_units_str(user_units);
     if ((!unit_conversion_mode) && (model_units_str != user_units_str)) {
-      error->all(FLERR, "Incompatible units for KIM Simulator Model"
-                                    ", required units = {}", model_units_str);
+      error->all(FLERR, "Incompatible units for KIM Simulator Model, required units = {}",
+                 model_units_str);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
-void KimInit::do_init(char *model_name, char *user_units, char *model_units,
-                      KIM_Model *&pkim)
+void KimInit::do_init(char *model_name, char *user_units, char *model_units, KIM_Model *&pkim)
 {
   // create storage proxy fix. delete existing fix, if needed.
 
@@ -304,8 +324,7 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
   fix_store->setptr("model_units", (void *) model_units);
 
   // Begin output to log file
-  input->write_echo("#=== BEGIN kim init ==================================="
-                    "=======\n");
+  input->write_echo("#=== BEGIN kim init ==========================================\n");
 
   KIM_SimulatorModel *simulatorModel;
   if (model_type == SM) {
@@ -316,18 +335,16 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
     KIM_SimulatorModel_SetLogID(simulatorModel, logID.c_str());
 
     char const *sim_name, *sim_version;
-    KIM_SimulatorModel_GetSimulatorNameAndVersion(
-        simulatorModel, &sim_name, &sim_version);
+    KIM_SimulatorModel_GetSimulatorNameAndVersion(simulatorModel, &sim_name, &sim_version);
 
     const std::string sim_name_str(sim_name);
-    if (sim_name_str != "LAMMPS")
-      error->all(FLERR, "Incompatible KIM Simulator Model");
+    if (sim_name_str != "LAMMPS") error->all(FLERR, "Incompatible KIM Simulator Model");
 
     if (comm->me == 0) {
       auto mesg = fmt::format("# Using KIM Simulator Model : {}\n"
-        "# For Simulator             : {} {}\n"
-        "# Running on                : LAMMPS {}\n#\n", model_name,
-        sim_name_str, sim_version, lmp->version);
+                              "# For Simulator             : {} {}\n"
+                              "# Running on                : LAMMPS {}\n#\n",
+                              model_name, sim_name_str, sim_version, lmp->version);
       utils::logmesg(lmp, mesg);
     }
 
@@ -350,18 +367,16 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
   // Set the skin and timestep default values as
   // 2.0 Angstroms and 1.0 femtosecond
 
-  const std::string skin_cmd =
-    (model_units_str == "real") ? "neighbor 2.0 bin   # Angstroms":
-    (model_units_str == "metal") ? "neighbor 2.0 bin   # Angstroms":
-    (model_units_str == "si") ? "neighbor 2e-10 bin   # meters":
-    (model_units_str == "cgs") ? "neighbor 2e-8 bin   # centimeters":
-    "neighbor 3.77945224 bin   # Bohr";
-  const std::string step_cmd =
-    (model_units_str == "real") ? "timestep 1.0       # femtoseconds":
-    (model_units_str == "metal") ? "timestep 1.0e-3    # picoseconds":
-    (model_units_str == "si") ? "timestep 1e-15       # seconds":
-    (model_units_str == "cgs") ? "timestep 1e-15      # seconds":
-    "timestep 1.0              # femtoseconds";
+  const std::string skin_cmd = (model_units_str == "real") ? "neighbor 2.0 bin   # Angstroms"
+      : (model_units_str == "metal")                       ? "neighbor 2.0 bin   # Angstroms"
+      : (model_units_str == "si")                          ? "neighbor 2e-10 bin   # meters"
+      : (model_units_str == "cgs")                         ? "neighbor 2e-8 bin   # centimeters"
+                                                           : "neighbor 3.77945224 bin   # Bohr";
+  const std::string step_cmd = (model_units_str == "real") ? "timestep 1.0       # femtoseconds"
+      : (model_units_str == "metal")                       ? "timestep 1.0e-3    # picoseconds"
+      : (model_units_str == "si")                          ? "timestep 1e-15       # seconds"
+      : (model_units_str == "cgs")                         ? "timestep 1e-15      # seconds"
+                                   : "timestep 1.0              # femtoseconds";
   input->one(skin_cmd);
   input->one(step_cmd);
 
@@ -373,14 +388,12 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
     // init model
 
     for (int i = 0; i < sim_fields; ++i) {
-      KIM_SimulatorModel_GetSimulatorFieldMetadata(
-        simulatorModel, i, &sim_lines, &sim_field);
+      KIM_SimulatorModel_GetSimulatorFieldMetadata(simulatorModel, i, &sim_lines, &sim_field);
 
       const std::string sim_field_str(sim_field);
       if (sim_field_str == "model-init") {
         for (int j = 0; j < sim_lines; ++j) {
-          KIM_SimulatorModel_GetSimulatorFieldLine(
-            simulatorModel, i, j, &sim_value);
+          KIM_SimulatorModel_GetSimulatorFieldLine(simulatorModel, i, j, &sim_value);
           input->one(sim_value);
         }
         break;
@@ -404,31 +417,28 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
 
       int max_len(0);
       for (int i = 0; i < numberOfParameters; ++i) {
-        KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType,
-        &extent, &str_name, &str_desc);
-        max_len = MAX(max_len, (int)strlen(str_name));
+        KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType, &extent, &str_name, &str_desc);
+        max_len = MAX(max_len, (int) strlen(str_name));
       }
       max_len = MAX(18, max_len + 1);
-      mesg += fmt::format(" No.      | {:<{}} | data type  | extent\n",
-                          "Parameter name", max_len);
+      mesg += fmt::format(" No.      | {:<{}} | data type  | extent\n", "Parameter name", max_len);
       mesg += fmt::format("{:-<{}}\n", "-", max_len + 35);
       for (int i = 0; i < numberOfParameters; ++i) {
-        KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType,
-        &extent, &str_name, &str_desc);
+        KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType, &extent, &str_name, &str_desc);
         auto data_type = std::string("\"");
         data_type += KIM_DataType_ToString(kim_DataType) + std::string("\"");
-        mesg += fmt::format(" {:<8} | {:<{}} | {:<10} | {}\n", i + 1, str_name,
-                            max_len, data_type, extent);
+        mesg += fmt::format(" {:<8} | {:<{}} | {:<10} | {}\n", i + 1, str_name, max_len, data_type,
+                            extent);
       }
-    } else mesg += "No mutable parameters.\n";
+    } else
+      mesg += "No mutable parameters.\n";
 
     KIM_Model_Destroy(&pkim);
     input->write_echo(mesg);
   }
 
   // End output to log file
-  input->write_echo("#=== END kim init ====================================="
-                    "=======\n\n");
+  input->write_echo("#=== END kim init ============================================\n\n");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -446,24 +456,11 @@ void KimInit::do_variables(const std::string &from, const std::string &to)
   int ier;
   std::string var_str;
   int v_unit;
-  const char *units[] = {"mass",
-                         "distance",
-                         "time",
-                         "energy",
-                         "velocity",
-                         "force",
-                         "torque",
-                         "temperature",
-                         "pressure",
-                         "viscosity",
-                         "charge",
-                         "dipole",
-                         "efield",
-                         "density",
-                         nullptr};
+  const char *units[] = {"mass",   "distance", "time",        "energy",   "velocity",
+                         "force",  "torque",   "temperature", "pressure", "viscosity",
+                         "charge", "dipole",   "efield",      "density",  nullptr};
 
-  input->write_echo(fmt::format("# Conversion factors from {} to {}:\n",
-                                from, to));
+  input->write_echo(fmt::format("# Conversion factors from {} to {}:\n", from, to));
 
   auto variable = input->variable;
   for (int i = 0; units[i] != nullptr; ++i) {
@@ -473,24 +470,23 @@ void KimInit::do_variables(const std::string &from, const std::string &to)
       variable->set(var_str + " internal 1.0");
       v_unit = variable->find(var_str.c_str());
     }
-    ier = lammps_unit_conversion(units[i], from, to,
-                                 conversion_factor);
+    ier = lammps_unit_conversion(units[i], from, to, conversion_factor);
     if (ier != 0)
-      error->all(FLERR, "Unable to obtain conversion factor: "
-                                    "unit = {}; from = {}; to = {}",
-                                    units[i], from, to);
+      error->all(FLERR,
+                 "Unable to obtain conversion factor: "
+                 "unit = {}; from = {}; to = {}",
+                 units[i], from, to);
 
     variable->internal_set(v_unit, conversion_factor);
-    input->write_echo(fmt::format("variable {:<15s} internal {:<15.12e}\n",
-                                  var_str, conversion_factor));
+    input->write_echo(
+        fmt::format("variable {:<15s} internal {:<15.12e}\n", var_str, conversion_factor));
   }
   input->write_echo("#\n");
 }
 
 /* ---------------------------------------------------------------------- */
 
-void KimInit::write_log_cite(class LAMMPS *lmp,
-                             KimInit::model_type_enum model_type,
+void KimInit::write_log_cite(class LAMMPS *lmp, KimInit::model_type_enum model_type,
                              char *model_name)
 {
   if (!lmp->citeme) return;
@@ -501,7 +497,7 @@ void KimInit::write_log_cite(class LAMMPS *lmp,
 
   std::string cite_id;
   if (kim_id.empty()) {
-    cite_id = fmt::format("KIM potential: unpublished, \"{}\"\n",model_name_str);
+    cite_id = fmt::format("KIM potential: unpublished, \"{}\"\n", model_name_str);
   } else {
     KIM_Collections *collections;
     int err = KIM_Collections_Create(&collections);
@@ -513,12 +509,10 @@ void KimInit::write_log_cite(class LAMMPS *lmp,
     int extent;
     if (model_type == MO) {
       err = KIM_Collections_CacheListOfItemMetadataFiles(
-          collections, KIM_COLLECTION_ITEM_TYPE_portableModel,
-          model_name, &extent);
+          collections, KIM_COLLECTION_ITEM_TYPE_portableModel, model_name, &extent);
     } else if (model_type == SM) {
       err = KIM_Collections_CacheListOfItemMetadataFiles(
-          collections, KIM_COLLECTION_ITEM_TYPE_simulatorModel,
-          model_name, &extent);
+          collections, KIM_COLLECTION_ITEM_TYPE_simulatorModel, model_name, &extent);
     } else {
       lmp->error->all(FLERR, "Unknown model type");
     }
@@ -529,19 +523,18 @@ void KimInit::write_log_cite(class LAMMPS *lmp,
     }
 
     cite_id = fmt::format("OpenKIM potential: https://openkim.org/cite/"
-                          "{}#item-citation\n\n",kim_id);
+                          "{}#item-citation\n\n",
+                          kim_id);
 
     for (int i = 0; i < extent; ++i) {
       char const *fileName;
       int availableAsString;
       char const *fileString;
-      err = KIM_Collections_GetItemMetadataFile(
-          collections, i, &fileName, nullptr, nullptr,
-          &availableAsString, &fileString);
+      err = KIM_Collections_GetItemMetadataFile(collections, i, &fileName, nullptr, nullptr,
+                                                &availableAsString, &fileString);
       if (err) continue;
 
-      if (utils::strmatch(fileName, "^kimcite") && availableAsString)
-        cite_id += fileString;
+      if (utils::strmatch(fileName, "^kimcite") && availableAsString) cite_id += fileString;
     }
     KIM_Collections_Destroy(&collections);
   }
diff --git a/src/KIM/kim_init.h b/src/KIM/kim_init.h
index fa042f2723..56922533ab 100644
--- a/src/KIM/kim_init.h
+++ b/src/KIM/kim_init.h
@@ -62,7 +62,8 @@
 #include "pointers.h"
 
 // Forward declaration.
-typedef struct KIM_Model KIM_Model;
+struct KIM_Model;
+struct KIM_Collections;
 
 namespace LAMMPS_NS {
 
@@ -80,6 +81,8 @@ class KimInit : protected Pointers {
   void determine_model_type_and_units(char *, char *, char **, KIM_Model *&);
   void do_init(char *, char *, char *, KIM_Model *&);
   void do_variables(const std::string &, const std::string &);
+
+  void print_dirs(struct KIM_Collections * const collections) const;
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/KOKKOS/compute_coord_atom_kokkos.cpp b/src/KOKKOS/compute_coord_atom_kokkos.cpp
index b71cd1ec4a..9f0e7fc435 100644
--- a/src/KOKKOS/compute_coord_atom_kokkos.cpp
+++ b/src/KOKKOS/compute_coord_atom_kokkos.cpp
@@ -59,7 +59,7 @@ ComputeCoordAtomKokkos<DeviceType>::ComputeCoordAtomKokkos(LAMMPS *lmp, int narg
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
-ComputeCoordAtomKokkos<DeviceType>::~ComputeCoordAtomKokkos<DeviceType>()
+ComputeCoordAtomKokkos<DeviceType>::~ComputeCoordAtomKokkos()
 {
   if (copymode) return;
 
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
index a782958045..61e4a05946 100644
--- a/src/KOKKOS/fix_rx_kokkos.h
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -196,7 +196,7 @@ class FixRxKokkos : public FixRX {
                   double& h0, VectorType& y, VectorType& rwk, UserDataType& userData) const;
 
   //!< ODE Solver diagnostics.
-  void odeDiagnostics(void);
+  void odeDiagnostics();
 
   //!< Special counters per-ode.
   int *diagnosticCounterPerODEnSteps;
@@ -231,7 +231,7 @@ class FixRxKokkos : public FixRX {
 
   bool update_kinetics_data;
 
-  void create_kinetics_data(void);
+  void create_kinetics_data();
 
   // Need a dual-view and device-view for dpdThetaLocal and sumWeights since they're used in several callbacks.
   DAT::tdual_efloat_1d k_dpdThetaLocal, k_sumWeights;
diff --git a/src/KOKKOS/pair_eam_alloy_kokkos.cpp b/src/KOKKOS/pair_eam_alloy_kokkos.cpp
index a8e65c4a92..9421946c3e 100644
--- a/src/KOKKOS/pair_eam_alloy_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_alloy_kokkos.cpp
@@ -17,23 +17,23 @@
 ------------------------------------------------------------------------- */
 
 #include "pair_eam_alloy_kokkos.h"
-#include <cmath>
-#include <cstring>
-#include "kokkos.h"
-#include "pair_kokkos.h"
+
 #include "atom_kokkos.h"
-#include "force.h"
+#include "atom_masks.h"
 #include "comm.h"
-#include "neighbor.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
 #include "neigh_list_kokkos.h"
 #include "neigh_request.h"
-#include "memory_kokkos.h"
-#include "error.h"
-#include "atom_masks.h"
-
-#include "tokenizer.h"
+#include "neighbor.h"
+#include "pair_kokkos.h"
 #include "potential_file_reader.h"
 
+#include <cmath>
+#include <cstring>
+
 using namespace LAMMPS_NS;
 
 // Cannot use virtual inheritance on the GPU, so must duplicate code
@@ -44,8 +44,8 @@ template<class DeviceType>
 PairEAMAlloyKokkos<DeviceType>::PairEAMAlloyKokkos(LAMMPS *lmp) : PairEAM(lmp)
 {
   respa_enable = 0;
+  single_enable = 0;
   one_coeff = 1;
-  manybody_flag = 1;
 
   kokkosable = 1;
   atomKK = (AtomKokkos *) atom;
@@ -261,6 +261,8 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     virial[5] += ev.v[5];
   }
 
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
   if (eflag_atom) {
     if (need_dup)
       Kokkos::Experimental::contribute(d_eatom, dup_eatom);
@@ -275,8 +277,6 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     k_vatom.template sync<LMPHostType>();
   }
 
-  if (vflag_fdotr) pair_virial_fdotr_compute(this);
-
   copymode = 0;
 
   // free duplicated memory
@@ -322,6 +322,11 @@ void PairEAMAlloyKokkos<DeviceType>::init_style()
 
 }
 
+/* ----------------------------------------------------------------------
+   convert read-in funcfl potential(s) to standard array format
+   interpolate all file values to a single grid and cutoff
+------------------------------------------------------------------------- */
+
 template<class DeviceType>
 void PairEAMAlloyKokkos<DeviceType>::file2array()
 {
@@ -524,7 +529,7 @@ void PairEAMAlloyKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, doubl
     h_rho[j] += buf[m++];
   }
 
-  k_fp.modify_host();
+  k_rho.modify_host();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -581,8 +586,8 @@ void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelA<NEIGHFLAG
                   d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6);
       if (NEWTON_PAIR || j < nlocal) {
         const int d_type2rhor_ij = d_type2rhor(itype,jtype);
-       a_rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p +
-                    d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6);
+        a_rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p +
+                      d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6);
       }
     }
 
@@ -597,7 +602,6 @@ template<class DeviceType>
 template<int EFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelB<EFLAG>, const int &ii, EV_FLOAT& ev) const {
-
   // fp = derivative of embedding energy at each atom
   // phi = embedding energy at each atom
   // if rho > rhomax (e.g. due to close approach of two atoms),
@@ -620,7 +624,6 @@ void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelB<EFLAG>, c
     if (eflag_global) ev.evdwl += phi;
     if (eflag_atom) d_eatom[i] += phi;
   }
-
 }
 
 template<class DeviceType>
diff --git a/src/KOKKOS/pair_eam_fs_kokkos.cpp b/src/KOKKOS/pair_eam_fs_kokkos.cpp
index b12de79b37..5fbd14d8b3 100644
--- a/src/KOKKOS/pair_eam_fs_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_fs_kokkos.cpp
@@ -17,23 +17,23 @@
 ------------------------------------------------------------------------- */
 
 #include "pair_eam_fs_kokkos.h"
-#include <cmath>
-#include <cstring>
-#include "kokkos.h"
-#include "pair_kokkos.h"
+
 #include "atom_kokkos.h"
-#include "force.h"
+#include "atom_masks.h"
 #include "comm.h"
-#include "neighbor.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
 #include "neigh_list_kokkos.h"
 #include "neigh_request.h"
-#include "memory_kokkos.h"
-#include "error.h"
-#include "atom_masks.h"
-
-#include "tokenizer.h"
+#include "neighbor.h"
+#include "pair_kokkos.h"
 #include "potential_file_reader.h"
 
+#include <cmath>
+#include <cstring>
+
 using namespace LAMMPS_NS;
 
 // Cannot use virtual inheritance on the GPU, so must duplicate code
@@ -43,9 +43,9 @@ using namespace LAMMPS_NS;
 template<class DeviceType>
 PairEAMFSKokkos<DeviceType>::PairEAMFSKokkos(LAMMPS *lmp) : PairEAM(lmp)
 {
-  one_coeff = 1;
-  manybody_flag = 1;
   respa_enable = 0;
+  single_enable = 0;
+  one_coeff = 1;
 
   kokkosable = 1;
   atomKK = (AtomKokkos *) atom;
@@ -200,9 +200,9 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   // communicate derivative of embedding function (on the device)
 
-  k_fp.template sync<DeviceType>();
-  comm->forward_comm_pair(this);
   k_fp.template modify<DeviceType>();
+  comm->forward_comm_pair(this);
+  k_fp.template sync<DeviceType>();
 
   // compute kernel C
 
@@ -322,6 +322,11 @@ void PairEAMFSKokkos<DeviceType>::init_style()
 
 }
 
+/* ----------------------------------------------------------------------
+   convert read-in funcfl potential(s) to standard array format
+   interpolate all file values to a single grid and cutoff
+------------------------------------------------------------------------- */
+
 template<class DeviceType>
 void PairEAMFSKokkos<DeviceType>::file2array()
 {
@@ -581,8 +586,8 @@ void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelA<NEIGHFLAG,NEWTO
                   d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6);
       if (NEWTON_PAIR || j < nlocal) {
         const int d_type2rhor_ij = d_type2rhor(itype,jtype);
-       a_rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p +
-                    d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6);
+        a_rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p +
+                      d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6);
       }
     }
 
@@ -620,7 +625,6 @@ void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelB<EFLAG>, const i
     if (eflag_global) ev.evdwl += phi;
     if (eflag_atom) d_eatom[i] += phi;
   }
-
 }
 
 template<class DeviceType>
diff --git a/src/KOKKOS/pair_eam_kokkos.cpp b/src/KOKKOS/pair_eam_kokkos.cpp
index c9d2808075..417efc3f7d 100644
--- a/src/KOKKOS/pair_eam_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_kokkos.cpp
@@ -17,18 +17,20 @@
 ------------------------------------------------------------------------- */
 
 #include "pair_eam_kokkos.h"
-#include <cmath>
-#include "kokkos.h"
-#include "pair_kokkos.h"
+
 #include "atom_kokkos.h"
-#include "force.h"
+#include "atom_masks.h"
 #include "comm.h"
-#include "neighbor.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
 #include "neigh_list_kokkos.h"
 #include "neigh_request.h"
-#include "memory_kokkos.h"
-#include "error.h"
-#include "atom_masks.h"
+#include "neighbor.h"
+#include "pair_kokkos.h"
+
+#include <cmath>
 
 using namespace LAMMPS_NS;
 
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
index 4d20c59482..40917d832e 100644
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -45,7 +45,7 @@ struct PairExp6ParamDataTypeKokkos
           epsilonOld2, alphaOld2, rmOld2, mixWtSite2old;
 
    // Default constructor -- nullify everything.
-   PairExp6ParamDataTypeKokkos<DeviceType>(void)
+   PairExp6ParamDataTypeKokkos()
       : n(0)
    {}
 };
@@ -63,7 +63,7 @@ struct PairExp6ParamDataTypeKokkosVect
                            nTotalold;
 
    // Default constructor -- nullify everything.
-   PairExp6ParamDataTypeKokkosVect<DeviceType>(void)
+   PairExp6ParamDataTypeKokkosVect()
    {}
 };
 
diff --git a/src/MAKE/MACHINES/Makefile.aarch64_g++_openmpi_armpl b/src/MAKE/MACHINES/Makefile.aarch64_g++_openmpi_armpl
index 4174c9c5e7..2ebd2ac744 100644
--- a/src/MAKE/MACHINES/Makefile.aarch64_g++_openmpi_armpl
+++ b/src/MAKE/MACHINES/Makefile.aarch64_g++_openmpi_armpl
@@ -8,12 +8,12 @@ SHELL = /bin/sh
 
 export OMPI_CXX = g++
 CC =		mpicxx
-CCFLAGS =	-O3 -march=native -mcpu=native
+CCFLAGS =	-O3 -march=native -mcpu=native -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx
-LINKFLAGS =	-O
+LINKFLAGS =	-O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/MACHINES/Makefile.aarch64_g++_serial_armpl b/src/MAKE/MACHINES/Makefile.aarch64_g++_serial_armpl
index 5cb6fa0cde..054b530bc8 100644
--- a/src/MAKE/MACHINES/Makefile.aarch64_g++_serial_armpl
+++ b/src/MAKE/MACHINES/Makefile.aarch64_g++_serial_armpl
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		g++
-CCFLAGS =	-O3 -march=native -mcpu=native
+CCFLAGS =	-O3 -march=native -mcpu=native -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		g++
-LINKFLAGS =	-O
+LINKFLAGS =	-O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/MACHINES/Makefile.cygwin b/src/MAKE/MACHINES/Makefile.cygwin
index 4c47860a56..3d4a50a8ea 100644
--- a/src/MAKE/MACHINES/Makefile.cygwin
+++ b/src/MAKE/MACHINES/Makefile.cygwin
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		mpicxx
-CCFLAGS =	-O
+CCFLAGS =	-O2  -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx
-LINKFLAGS =	-O
+LINKFLAGS =	-O -std=c++11
 LIB =           
 SIZE =		size
 
diff --git a/src/MAKE/MACHINES/Makefile.mac b/src/MAKE/MACHINES/Makefile.mac
index 67381fe622..fb749b5759 100644
--- a/src/MAKE/MACHINES/Makefile.mac
+++ b/src/MAKE/MACHINES/Makefile.mac
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		c++
-CCFLAGS =	-O
+CCFLAGS =	-O -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		c++
-LINKFLAGS =	-O
+LINKFLAGS =	-O -std=c++11
 LIB =           
 SIZE =		size
 
diff --git a/src/MAKE/MACHINES/Makefile.mac_mpi b/src/MAKE/MACHINES/Makefile.mac_mpi
index 0b6b4b5ba2..4718c94e51 100644
--- a/src/MAKE/MACHINES/Makefile.mac_mpi
+++ b/src/MAKE/MACHINES/Makefile.mac_mpi
@@ -8,12 +8,12 @@ SHELL = /bin/sh
 # unless additional compiler/linker flags or libraries needed for your machine
 
 CC =	 	/opt/local/bin/mpicxx-openmpi-mp
-CCFLAGS =	-O3 
+CCFLAGS =	-O3  -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		/opt/local/bin/mpicxx-openmpi-mp
-LINKFLAGS =	-O3
+LINKFLAGS =	-O3 -std=c++11
 LIB =           
 SIZE =		size
 
diff --git a/src/MAKE/MACHINES/Makefile.ubuntu b/src/MAKE/MACHINES/Makefile.ubuntu
index 6c419ffdfa..f030ce64df 100644
--- a/src/MAKE/MACHINES/Makefile.ubuntu
+++ b/src/MAKE/MACHINES/Makefile.ubuntu
@@ -11,12 +11,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		mpic++
-CCFLAGS =	-g -O3 # -Wunused
+CCFLAGS =	-g -O3  -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpic++
-LINKFLAGS =	-g -O3
+LINKFLAGS =	-g -O3 -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/MACHINES/Makefile.ubuntu_simple b/src/MAKE/MACHINES/Makefile.ubuntu_simple
index 98897f964f..e8b58fc804 100644
--- a/src/MAKE/MACHINES/Makefile.ubuntu_simple
+++ b/src/MAKE/MACHINES/Makefile.ubuntu_simple
@@ -10,12 +10,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		mpic++
-CCFLAGS =	-g -O3 # -Wunused
+CCFLAGS =	-g -O3  -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpic++
-LINKFLAGS =	-g -O3
+LINKFLAGS =	-g -O3 -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/Makefile.mpi b/src/MAKE/Makefile.mpi
index 9776b0153e..42f48b4e2c 100644
--- a/src/MAKE/Makefile.mpi
+++ b/src/MAKE/Makefile.mpi
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		mpicxx
-CCFLAGS =	-g -O3
+CCFLAGS =	-g -O3 -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx
-LINKFLAGS =	-g -O3
+LINKFLAGS =	-g -O3 -std=c++11
 LIB =
 SIZE =		size
 
diff --git a/src/MAKE/Makefile.serial b/src/MAKE/Makefile.serial
index 0f5952f317..b527919147 100644
--- a/src/MAKE/Makefile.serial
+++ b/src/MAKE/Makefile.serial
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		g++
-CCFLAGS =	-g -O3
+CCFLAGS =	-g -O3 -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		g++
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB =
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.g++_mpich b/src/MAKE/OPTIONS/Makefile.g++_mpich
index 4ea855cfeb..e0c77437f5 100644
--- a/src/MAKE/OPTIONS/Makefile.g++_mpich
+++ b/src/MAKE/OPTIONS/Makefile.g++_mpich
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		mpicxx -cxx=g++
-CCFLAGS =	-g -O3
+CCFLAGS =	-g -O3 -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx -cxx=g++
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.g++_mpich_link b/src/MAKE/OPTIONS/Makefile.g++_mpich_link
index 7b92a3e77a..4f2855a9cc 100644
--- a/src/MAKE/OPTIONS/Makefile.g++_mpich_link
+++ b/src/MAKE/OPTIONS/Makefile.g++_mpich_link
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		g++
-CCFLAGS =	-g -O3
+CCFLAGS =	-g -O3 -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		g++
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.g++_openmpi_link b/src/MAKE/OPTIONS/Makefile.g++_openmpi_link
index 6fc71fe2a5..0c9997dbb0 100644
--- a/src/MAKE/OPTIONS/Makefile.g++_openmpi_link
+++ b/src/MAKE/OPTIONS/Makefile.g++_openmpi_link
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		g++
-CCFLAGS =	-g -O3
+CCFLAGS =	-g -O3 -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		g++
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.g++_serial b/src/MAKE/OPTIONS/Makefile.g++_serial
index 4f6f0afe22..d6b9bf3221 100644
--- a/src/MAKE/OPTIONS/Makefile.g++_serial
+++ b/src/MAKE/OPTIONS/Makefile.g++_serial
@@ -6,13 +6,13 @@ SHELL = /bin/sh
 # compiler/linker settings
 # specify flags and libraries needed for your compiler
 
-CC =		g++ -std=c++11
-CCFLAGS =	-g -O3
+CC =		g++
+CCFLAGS =	-g -O3 -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
-LINK =		g++ -std=c++11
-LINKFLAGS =	-g -O
+LINK =		g++
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.gpu b/src/MAKE/OPTIONS/Makefile.gpu
index 26c98c120d..9ad5cf477c 100644
--- a/src/MAKE/OPTIONS/Makefile.gpu
+++ b/src/MAKE/OPTIONS/Makefile.gpu
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		mpicxx
-CCFLAGS =	-g -O3 
+CCFLAGS =	-g -O3 -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.icc_mpich b/src/MAKE/OPTIONS/Makefile.icc_mpich
index cf76506da5..c630c42c26 100644
--- a/src/MAKE/OPTIONS/Makefile.icc_mpich
+++ b/src/MAKE/OPTIONS/Makefile.icc_mpich
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		mpicxx -cxx=icc
-CCFLAGS =	-g -O3 -restrict
+CCFLAGS =	-g -O3 -restrict -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx -cxx=icc
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.icc_mpich_link b/src/MAKE/OPTIONS/Makefile.icc_mpich_link
index 3994968430..8b89d2509a 100644
--- a/src/MAKE/OPTIONS/Makefile.icc_mpich_link
+++ b/src/MAKE/OPTIONS/Makefile.icc_mpich_link
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		icc
-CCFLAGS =	-g -O3 -restrict
+CCFLAGS =	-g -O3 -restrict -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		icc
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.icc_openmpi b/src/MAKE/OPTIONS/Makefile.icc_openmpi
index 72e3d44093..0a2c9598a6 100644
--- a/src/MAKE/OPTIONS/Makefile.icc_openmpi
+++ b/src/MAKE/OPTIONS/Makefile.icc_openmpi
@@ -8,12 +8,12 @@ SHELL = /bin/sh
 
 export OMPI_CXX = icc
 CC =		mpicxx
-CCFLAGS =	-g -O3 -restrict
+CCFLAGS =	-g -O3 -restrict -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.icc_openmpi_link b/src/MAKE/OPTIONS/Makefile.icc_openmpi_link
index e44486aeb5..825d4cdff0 100644
--- a/src/MAKE/OPTIONS/Makefile.icc_openmpi_link
+++ b/src/MAKE/OPTIONS/Makefile.icc_openmpi_link
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		icc
-CCFLAGS =	-g -O3 -restrict
+CCFLAGS =	-g -O3 -restrict -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		icc
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.icc_serial b/src/MAKE/OPTIONS/Makefile.icc_serial
index a81c73c718..2d2da54c68 100644
--- a/src/MAKE/OPTIONS/Makefile.icc_serial
+++ b/src/MAKE/OPTIONS/Makefile.icc_serial
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		icc
-CCFLAGS =	-g -O3 -restrict
+CCFLAGS =	-g -O3 -restrict -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		icc
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.jpeg b/src/MAKE/OPTIONS/Makefile.jpeg
index e8f1f3e96a..268e7b94e8 100644
--- a/src/MAKE/OPTIONS/Makefile.jpeg
+++ b/src/MAKE/OPTIONS/Makefile.jpeg
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		mpicxx
-CCFLAGS =	-g -O3 
+CCFLAGS =	-g -O3  -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.omp b/src/MAKE/OPTIONS/Makefile.omp
index 0f49cdb15c..573c2d826b 100644
--- a/src/MAKE/OPTIONS/Makefile.omp
+++ b/src/MAKE/OPTIONS/Makefile.omp
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		mpicxx
-CCFLAGS =	-g -O3 -restrict -fopenmp
+CCFLAGS =	-g -O3 -restrict -fopenmp -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx
-LINKFLAGS =	-g -O -fopenmp
+LINKFLAGS =	-g -O -fopenmp -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.opt b/src/MAKE/OPTIONS/Makefile.opt
index 8919e6e1d9..2cb5540fd4 100644
--- a/src/MAKE/OPTIONS/Makefile.opt
+++ b/src/MAKE/OPTIONS/Makefile.opt
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		mpicxx
-CCFLAGS =	-g -O3 -restrict
+CCFLAGS =	-g -O3 -restrict -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.png b/src/MAKE/OPTIONS/Makefile.png
index 9fd7b9b79c..40ebe43d32 100644
--- a/src/MAKE/OPTIONS/Makefile.png
+++ b/src/MAKE/OPTIONS/Makefile.png
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		mpicxx
-CCFLAGS =	-g -O3 
+CCFLAGS =	-g -O3  -std=c++11
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx
-LINKFLAGS =	-g -O
+LINKFLAGS =	-g -O -std=c++11
 LIB = 
 SIZE =		size
 
diff --git a/src/MPIIO/dump_xyz_mpiio.cpp b/src/MPIIO/dump_xyz_mpiio.cpp
index 24fd130e3d..f322a0da58 100644
--- a/src/MPIIO/dump_xyz_mpiio.cpp
+++ b/src/MPIIO/dump_xyz_mpiio.cpp
@@ -16,17 +16,18 @@
    Contributing author: Paul Coffman (IBM)
 ------------------------------------------------------------------------- */
 
-#include "omp_compat.h"
 #include "dump_xyz_mpiio.h"
-#include <cmath>
 
-#include <cstring>
-#include "domain.h"
-#include "update.h"
 #include "compute.h"
-#include "memory.h"
+#include "domain.h"
 #include "error.h"
+#include "memory.h"
+#include "update.h"
 
+#include <cmath>
+#include <cstring>
+
+#include "omp_compat.h"
 #if defined(_OPENMP)
 #include <omp.h>
 #endif
diff --git a/src/MSCG/fix_mscg.cpp b/src/MSCG/fix_mscg.cpp
index 8387ff4ebf..1aa644b976 100644
--- a/src/MSCG/fix_mscg.cpp
+++ b/src/MSCG/fix_mscg.cpp
@@ -140,14 +140,14 @@ void FixMSCG::post_constructor()
   tagint *tag = atom->tag;
   int *type = atom->type;
   int *num_bond = atom->num_bond;
-  int **bond_atom = atom->bond_atom;
+  tagint **bond_atom = atom->bond_atom;
   int *num_angle = atom->num_angle;
-  int **angle_atom1 = atom->angle_atom1;
-  int **angle_atom3 = atom->angle_atom3;
+  tagint **angle_atom1 = atom->angle_atom1;
+  tagint **angle_atom3 = atom->angle_atom3;
   int *num_dihedral = atom->num_dihedral;
-  int **dihedral_atom1 = atom->dihedral_atom1;
-  int **dihedral_atom3 = atom->dihedral_atom3;
-  int **dihedral_atom4 = atom->dihedral_atom4;
+  tagint **dihedral_atom1 = atom->dihedral_atom1;
+  tagint **dihedral_atom3 = atom->dihedral_atom3;
+  tagint **dihedral_atom4 = atom->dihedral_atom4;
   double *prd_half = domain->prd_half;
   int i,ii,j,jj,jnum,k,l;
 
diff --git a/src/Makefile b/src/Makefile
index 4e6fb5cea9..7f02c1e84b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -260,12 +260,11 @@ uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$(
 uppercase=$(eval uppercase_RESULT:=$(call uppercase_internal,$(uppercase_TABLE),$1))$(uppercase_RESULT)
 
 PACKAGEUC = $(call uppercase,$(PACKAGE))
-PACKUSERUC = $(call uppercase,$(PACKUSER))
+PACKAGESORTED = $(sort $(PACKAGEUC))
 
 YESDIR = $(call uppercase,$(@:yes-%=%))
 NODIR  = $(call uppercase,$(@:no-%=%))
 LIBDIR = $(@:lib-%=%)
-LIBUSERDIR = $(@:lib-%=%)
 
 # List of all targets
 
@@ -332,7 +331,7 @@ lmpinstalledpkgs.h: $(SRC) $(INC)
 	@echo '#ifndef LMP_INSTALLED_PKGS_H' >  ${TMPNAME}.lmpinstalled
 	@echo '#define LMP_INSTALLED_PKGS_H' >> ${TMPNAME}.lmpinstalled
 	@echo 'const char * LAMMPS_NS::LAMMPS::installed_packages[] = {' >> ${TMPNAME}.lmpinstalled
-	@for p in $(PACKAGEUC) $(PACKUSERUC); do info=$$($(SHELL) Package.sh $$p installed); \
+	@for p in $(PACKAGEUC); do info=$$($(SHELL) Package.sh $$p installed); \
              [ -n "$$info" ] && echo "\"$$info\"" | sed -e 's/".*package \(.*\)"/"\1",/' >> ${TMPNAME}.lmpinstalled || :; done
 	@echo ' NULL };' >> ${TMPNAME}.lmpinstalled
 	@echo '#endif' >> ${TMPNAME}.lmpinstalled
@@ -469,7 +468,7 @@ tar:
 	@cd ..; tar cvzf src/$(ROOT)_src.tar.gz \
 	  src/Make* src/Package.sh src/Depend.sh src/Install.sh src/Fetch.sh \
 	  src/MAKE src/DEPEND src/*.cpp src/*.h src/STUBS \
-	  $(patsubst %,src/%,$(PACKAGEUC)) $(patsubst %,src/%,$(PACKUSERUC)) \
+	  $(patsubst %,src/%,$(PACKAGEUC)) \
           --exclude=*/.svn
 	@cd STUBS; $(MAKE)
 	@echo "Created $(ROOT)_src.tar.gz"
@@ -502,9 +501,7 @@ format-tests:
 # Package management
 
 package:
-	@echo 'Standard packages:' $(PACKAGE)
-	@echo ''
-	@echo 'User-contributed packages:' $(PACKUSER)
+	@echo 'Available packages:' $(PACKAGE)
 	@echo ''
 	@echo 'Packages that need system libraries:' $(PACKSYS)
 	@echo ''
@@ -615,9 +612,6 @@ lib-%:
 	@if [ -e ../lib/$(LIBDIR)/Install.py ]; then \
 	  echo "Installing lib $(@:lib-%=%)"; \
 	  ( cd ../lib/$(LIBDIR); $(PYTHON) Install.py $(args) ); \
-	elif [ -e ../lib/$(LIBUSERDIR)/Install.py ]; then \
-	  echo "Installing lib $(@:lib-%=%)"; \
-	  ( cd ../lib/$(LIBUSERDIR); $(PYTHON) Install.py $(args) ); \
 	else \
 	  echo "Install script for lib $(@:lib-%=%) does not exist"; \
 	fi; touch main.cpp
@@ -630,28 +624,21 @@ lib-%:
 # purge = delete obsolete and auto-generated package files
 
 package-status ps:
-	@for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p status; done
-	@echo ''
-	@for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p status; done
+	@for p in $(PACKAGESORTED); do $(SHELL) Package.sh $$p status; done
 
 package-installed pi:
-	@for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p installed; done
-	@for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p installed; done
+	@for p in $(PACKAGESORTED); do $(SHELL) Package.sh $$p installed; done
 
 package-update pu: purge
+	@echo 'Updating installed packages:'
 	@for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p update; done
-	@echo ''
-	@for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p update; done
 
 package-overwrite: purge
-	@for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p overwrite; done
-	@echo ''
-	@for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p overwrite; done
+	@echo 'Overwriting installed packages:'
+	@for p in $(PACKAGESORTED); do $(SHELL) Package.sh $$p overwrite; done
 
 package-diff pd:
-	@for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p diff; done
-	@echo ''
-	@for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p diff; done
+	@for p in $(PACKAGESORTED); do $(SHELL) Package.sh $$p diff; done
 
 purge: Purge.list
 	@echo 'Purging obsolete and auto-generated source files'
diff --git a/src/Package.sh b/src/Package.sh
index f776a02e48..aa217fb555 100755
--- a/src/Package.sh
+++ b/src/Package.sh
@@ -45,9 +45,8 @@ elif (test $2 = "installed") then
 # perform a re-install, but only if the package is already installed
 
 elif (test $2 = "update") then
-  echo "Updating src files from $1 package files"
   if (test $installed = 1) then
-    echo "  updating package $1"
+     echo "Updating src files from $1 package files"
     if (test -e Install.sh) then
       /bin/sh Install.sh 2
     else
@@ -55,16 +54,14 @@ elif (test $2 = "update") then
     fi
     cd ..
     /bin/sh Depend.sh $1
-  else
-    echo "  $1 package is not installed"
   fi
 
 # overwrite, only if installed
 # overwrite package file with src file, if the two are different
 
 elif (test $2 = "overwrite") then
-  echo "Overwriting $1 package files with src files"
   if (test $installed = 1) then
+     echo "Overwriting $1 package files with src files"
     for file in *.cpp *.h; do
       if (test ! -e ../$file) then
         continue
diff --git a/src/REACTION/fix_bond_react.cpp b/src/REACTION/fix_bond_react.cpp
index ac009821cf..69c9c87ddf 100644
--- a/src/REACTION/fix_bond_react.cpp
+++ b/src/REACTION/fix_bond_react.cpp
@@ -910,7 +910,8 @@ void FixBondReact::post_integrate()
 
   int j;
   for (rxnID = 0; rxnID < nreacts; rxnID++) {
-    if (max_rxn[rxnID] <= reaction_count_total[rxnID]) continue;
+    if ((update->ntimestep % nevery[rxnID]) ||
+        (max_rxn[rxnID] <= reaction_count_total[rxnID])) continue;
     for (int ii = 0; ii < nall; ii++) {
       partner[ii] = 0;
       finalpartner[ii] = 0;
diff --git a/src/compute_angle_local.cpp b/src/compute_angle_local.cpp
index 7401d8b214..2bceb91dd5 100644
--- a/src/compute_angle_local.cpp
+++ b/src/compute_angle_local.cpp
@@ -194,7 +194,7 @@ void ComputeAngleLocal::compute_local()
 
 int ComputeAngleLocal::compute_angles(int flag)
 {
-  int i,m,n,na,atom1,atom2,atom3,imol,iatom,atype,ivar;
+  int i,m,na,atom1,atom2,atom3,imol,iatom,atype,ivar;
   tagint tagprev;
   double delx1,dely1,delz1,delx2,dely2,delz2;
   double rsq1,rsq2,r1,r2,c,theta;
diff --git a/src/dump_atom.cpp b/src/dump_atom.cpp
index f3a03735a3..0dbd3b3278 100644
--- a/src/dump_atom.cpp
+++ b/src/dump_atom.cpp
@@ -560,7 +560,8 @@ void DumpAtom::write_binary(int n, double *mybuf)
 
 void DumpAtom::write_string(int n, double *mybuf)
 {
-  fwrite(mybuf,sizeof(char),n,fp);
+  if (mybuf)
+    fwrite(mybuf,sizeof(char),n,fp);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/dump_cfg.cpp b/src/dump_cfg.cpp
index 28f5a35a4b..8e87f4104e 100644
--- a/src/dump_cfg.cpp
+++ b/src/dump_cfg.cpp
@@ -233,7 +233,8 @@ void DumpCFG::write_data(int n, double *mybuf)
 
 void DumpCFG::write_string(int n, double *mybuf)
 {
-  fwrite(mybuf,sizeof(char),n,fp);
+  if (mybuf)
+    fwrite(mybuf,sizeof(char),n,fp);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/dump_custom.cpp b/src/dump_custom.cpp
index 16c67927bf..b2acdbfc51 100644
--- a/src/dump_custom.cpp
+++ b/src/dump_custom.cpp
@@ -1234,7 +1234,8 @@ void DumpCustom::write_binary(int n, double *mybuf)
 
 void DumpCustom::write_string(int n, double *mybuf)
 {
-  fwrite(mybuf,sizeof(char),n,fp);
+  if (mybuf)
+    fwrite(mybuf,sizeof(char),n,fp);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/dump_local.cpp b/src/dump_local.cpp
index f9a970d2b2..96d8944e2f 100644
--- a/src/dump_local.cpp
+++ b/src/dump_local.cpp
@@ -399,7 +399,8 @@ void DumpLocal::write_data(int n, double *mybuf)
 
 void DumpLocal::write_string(int n, double *mybuf)
 {
-  fwrite(mybuf,sizeof(char),n,fp);
+  if (mybuf)
+    fwrite(mybuf,sizeof(char),n,fp);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/dump_xyz.cpp b/src/dump_xyz.cpp
index ebbd432f5d..e009937959 100644
--- a/src/dump_xyz.cpp
+++ b/src/dump_xyz.cpp
@@ -13,12 +13,14 @@
 ------------------------------------------------------------------------- */
 
 #include "dump_xyz.h"
-#include <cstring>
+
 #include "atom.h"
 #include "error.h"
 #include "memory.h"
 #include "update.h"
 
+#include <cstring>
+
 using namespace LAMMPS_NS;
 
 #define ONELINE 128
@@ -194,7 +196,8 @@ void DumpXYZ::write_data(int n, double *mybuf)
 
 void DumpXYZ::write_string(int n, double *mybuf)
 {
-  fwrite(mybuf,sizeof(char),n,fp);
+  if (mybuf)
+    fwrite(mybuf,sizeof(char),n,fp);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/fix_dt_reset.cpp b/src/fix_dt_reset.cpp
index 409729c242..c80c976504 100644
--- a/src/fix_dt_reset.cpp
+++ b/src/fix_dt_reset.cpp
@@ -171,7 +171,7 @@ void FixDtReset::end_of_step()
       if (vsq > 0.0) dtv = xmax / sqrt(vsq);
       if (fsq > 0.0) dtf = sqrt(2.0 * xmax / (ftm2v * sqrt(fsq) * massinv));
       dt = MIN(dtv, dtf);
-      if (emax > 0.0 && vsq > 0.0 && fsq > 0.0) {
+      if ((emax > 0.0) && (fsq * vsq > 0.0)) {
         dte = emax / sqrt(fsq * vsq) / sqrt(ftm2v * mvv2e);
         dt = MIN(dt, dte);
       }
diff --git a/src/neighbor.cpp b/src/neighbor.cpp
index c88885fc54..74bb3bf762 100644
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@@ -915,11 +915,14 @@ int Neighbor::init_pair()
     requests[i]->index_bin = -1;
     flag = lists[i]->bin_method;
     if (flag == 0) continue;
-    for (j = 0; j < nbin; j++)
-      if (neigh_bin[j]->istyle == flag) break;
-    if (j < nbin && !requests[i]->unique) {
-      requests[i]->index_bin = j;
-      continue;
+    if (!requests[i]->unique) {
+      for (j = 0; j < nbin; j++)
+        if (neigh_bin[j]->istyle == flag &&
+            neigh_bin[j]->cutoff_custom == 0.0) break;
+      if (j < nbin) {
+        requests[i]->index_bin = j;
+        continue;
+      }
     }
 
     BinCreator &bin_creator = binclass[flag-1];
@@ -936,11 +939,14 @@ int Neighbor::init_pair()
     requests[i]->index_stencil = -1;
     flag = lists[i]->stencil_method;
     if (flag == 0) continue;
-    for (j = 0; j < nstencil; j++)
-      if (neigh_stencil[j]->istyle == flag) break;
-    if (j < nstencil && !requests[i]->unique) {
-      requests[i]->index_stencil = j;
-      continue;
+    if (!requests[i]->unique) {
+      for (j = 0; j < nstencil; j++)
+        if (neigh_stencil[j]->istyle == flag &&
+            neigh_stencil[j]->cutoff_custom == 0.0) break;
+      if (j < nstencil) {
+        requests[i]->index_stencil = j;
+        continue;
+      }
     }
 
     StencilCreator &stencil_creator = stencilclass[flag-1];
@@ -2515,6 +2521,7 @@ void Neighbor::modify_params(int narg, char **arg)
       int i;
 
       // Invalidate old user cutoffs
+
       comm->ncollections_cutoff = 0;
       interval_collection_flag = 1;
       custom_collection_flag = 1;
@@ -2546,9 +2553,10 @@ void Neighbor::modify_params(int narg, char **arg)
         error->all(FLERR,"Invalid collection/type command");
 
       int ntypes = atom->ntypes;
-      int n, nlo, nhi, i, j, k;
+      int nlo, nhi, i, k;
 
       // Invalidate old user cutoffs
+
       comm->ncollections_cutoff = 0;
       interval_collection_flag = 0;
       custom_collection_flag = 1;
@@ -2556,10 +2564,12 @@ void Neighbor::modify_params(int narg, char **arg)
         memory->create(type2collection,ntypes+1,"neigh:type2collection");
 
       // Erase previous mapping
+
       for (i = 1; i <= ntypes; i++)
         type2collection[i] = -1;
 
       // For each custom range, define mapping for types in interval
+
       for (i = 0; i < ncollections; i++){
         std::vector<std::string> words = Tokenizer(arg[iarg+2+i], ",").as_vector();
         for (const auto &word : words) {
@@ -2573,6 +2583,7 @@ void Neighbor::modify_params(int narg, char **arg)
       }
 
       // Check for undefined atom type
+
       for (i = 1; i <= ntypes; i++){
         if (type2collection[i] == -1) {
           error->all(FLERR,"Type missing in collection/type commnd");
diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp
index d1b5e5bd2e..b5daa111da 100644
--- a/src/pair_hybrid.cpp
+++ b/src/pair_hybrid.cpp
@@ -512,13 +512,16 @@ void PairHybrid::coeff(int narg, char **arg)
   // then unset setflag/map assigned to that style before setting it below
   // in case pair coeff for this sub-style is being called for 2nd time
 
-  if (!none && styles[m]->one_coeff)
+  if (!none && styles[m]->one_coeff) {
+    if ((strcmp(arg[0],"*") != 0) || (strcmp(arg[1],"*") != 0))
+      error->all(FLERR,"Incorrect args for pair coefficients");
     for (int i = 1; i <= atom->ntypes; i++)
       for (int j = i; j <= atom->ntypes; j++)
         if (nmap[i][j] && map[i][j][0] == m) {
           setflag[i][j] = 0;
           nmap[i][j] = 0;
         }
+  }
 
   // set setflag and which type pairs map to which sub-style
   // if sub-style is none: set hybrid setflag, wipe out map
diff --git a/src/pair_hybrid_overlay.cpp b/src/pair_hybrid_overlay.cpp
index db12750f40..e93473e3c9 100644
--- a/src/pair_hybrid_overlay.cpp
+++ b/src/pair_hybrid_overlay.cpp
@@ -70,6 +70,12 @@ void PairHybridOverlay::coeff(int narg, char **arg)
   arg[2+multflag] = arg[1];
   arg[1+multflag] = arg[0];
 
+  // ensure that one_coeff flag is honored (I,J now live in arg[1+multflag],arg[2+multflag])
+
+  if (!none && styles[m]->one_coeff)
+    if ((strcmp(arg[1+multflag],"*") != 0) || (strcmp(arg[2+multflag],"*") != 0))
+      error->all(FLERR,"Incorrect args for pair coefficients");
+
   // invoke sub-style coeff() starting with 1st remaining arg
 
   if (!none) styles[m]->coeff(narg-1-multflag,arg+1+multflag);
diff --git a/src/pair_hybrid_scaled.cpp b/src/pair_hybrid_scaled.cpp
index 90e30dd9b2..5bf593d147 100644
--- a/src/pair_hybrid_scaled.cpp
+++ b/src/pair_hybrid_scaled.cpp
@@ -474,6 +474,12 @@ void PairHybridScaled::coeff(int narg, char **arg)
   arg[2 + multflag] = arg[1];
   arg[1 + multflag] = arg[0];
 
+  // ensure that one_coeff flag is honored (I,J now live in arg[1 + multflag],arg[2 + multflag])
+
+  if (!none && styles[m]->one_coeff)
+    if ((strcmp(arg[1 + multflag],"*") != 0) || (strcmp(arg[2 + multflag],"*") != 0))
+      error->all(FLERR,"Incorrect args for pair coefficients");
+
   // invoke sub-style coeff() starting with 1st remaining arg
 
   if (!none) styles[m]->coeff(narg - 1 - multflag, &arg[1 + multflag]);
diff --git a/src/read_data.cpp b/src/read_data.cpp
index c33c65f676..574117e93e 100644
--- a/src/read_data.cpp
+++ b/src/read_data.cpp
@@ -1321,7 +1321,7 @@ void ReadData::bonds(int firstpass)
   int *count = nullptr;
   if (firstpass) {
     memory->create(count,nlocal,"read_data:count");
-    memset(count,0,nlocal*sizeof(int));
+    if (count) memset(count,0,nlocal*sizeof(int));
   }
 
   // read and process bonds
@@ -1395,7 +1395,7 @@ void ReadData::angles(int firstpass)
   int *count = nullptr;
   if (firstpass) {
     memory->create(count,nlocal,"read_data:count");
-    memset(count,0,nlocal*sizeof(int));
+    if (count) memset(count,0,nlocal*sizeof(int));
   }
 
   // read and process angles
@@ -1469,7 +1469,7 @@ void ReadData::dihedrals(int firstpass)
   int *count = nullptr;
   if (firstpass) {
     memory->create(count,nlocal,"read_data:count");
-    memset(count,0,nlocal*sizeof(int));
+    if (count) memset(count,0,nlocal*sizeof(int));
   }
 
   // read and process dihedrals
@@ -1543,7 +1543,7 @@ void ReadData::impropers(int firstpass)
   int *count = nullptr;
   if (firstpass) {
     memory->create(count,nlocal,"read_data:count");
-    memset(count,0,nlocal*sizeof(int));
+    if (count) memset(count,0,nlocal*sizeof(int));
   }
 
   // read and process impropers
diff --git a/src/version.h b/src/version.h
index 6b4ecca26b..c1b2b627a8 100644
--- a/src/version.h
+++ b/src/version.h
@@ -1 +1 @@
-#define LAMMPS_VERSION "20 Sep 2021"
+#define LAMMPS_VERSION "29 Sep 2021"
diff --git a/tools/singularity/centos7.def b/tools/singularity/centos7.def
index f64db0649b..8a3235b58f 100644
--- a/tools/singularity/centos7.def
+++ b/tools/singularity/centos7.def
@@ -36,7 +36,7 @@ From: centos:7
         # manually install Plumed
         mkdir plumed
         cd plumed
-        version=2.6.1
+        version=2.7.2
         curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
         tar -xzf plumed.tar.gz
         cd plumed-${version}
diff --git a/tools/singularity/centos8.def b/tools/singularity/centos8.def
index c48d2718eb..e35f97f453 100644
--- a/tools/singularity/centos8.def
+++ b/tools/singularity/centos8.def
@@ -3,7 +3,7 @@ From: centos:8
 
 %post
         dnf -y install epel-release dnf-utils
-        dnf config-manager --set-enabled PowerTools
+        dnf config-manager --set-enabled powertools
         dnf -y update
         dnf -y install vim-enhanced git file make cmake patch which file ninja-build \
                ccache gcc-c++ gcc-gfortran clang libomp-devel gdb valgrind libubsan libasan libtsan \
@@ -42,7 +42,7 @@ From: centos:8
         # manually install Plumed
         mkdir plumed
         cd plumed
-        version=2.6.1
+        version=2.7.2
         curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
         tar -xzf plumed.tar.gz
         cd plumed-${version}
diff --git a/tools/singularity/fedora34_mingw.def b/tools/singularity/fedora34_mingw.def
index 2f8118778f..40e6f72861 100644
--- a/tools/singularity/fedora34_mingw.def
+++ b/tools/singularity/fedora34_mingw.def
@@ -30,6 +30,7 @@ From: fedora:34
                mingw32-readline mingw64-readline \
                mingw32-termcap mingw64-termcap \
                mingw32-zlib mingw64-zlib \
+               mingw32-zstd mingw64-zstd \
                enchant python3-virtualenv doxygen latexmk \
                texlive-latex-fonts texlive-pslatex texlive-collection-latexrecommended \
                texlive-latex texlive-latexconfig doxygen-latex texlive-collection-latex \
diff --git a/tools/singularity/rocky8.def b/tools/singularity/rocky8.def
new file mode 100644
index 0000000000..0827b1d548
--- /dev/null
+++ b/tools/singularity/rocky8.def
@@ -0,0 +1,110 @@
+BootStrap: docker
+From: rockylinux/rockylinux:8
+
+%post
+        dnf -y install epel-release dnf-utils
+        dnf config-manager --set-enabled powertools
+        dnf -y update
+        dnf -y install vim-enhanced git file make cmake patch which file ninja-build \
+               ccache gcc-c++ gcc-gfortran clang libomp-devel gdb valgrind libubsan libasan libtsan \
+               eigen3-devel openblas-devel libpng-devel libjpeg-devel platform-python-devel \
+               openmpi-devel mpich-devel fftw-devel voro++-devel gsl-devel hdf5-devel \
+               netcdf-devel netcdf-cxx-devel netcdf-mpich-devel netcdf-openmpi-devel \
+               enchant python3-virtualenv doxygen diffutils latexmk readline-devel \
+               texlive-latex-fonts texlive-pslatex texlive-collection-latexrecommended \
+               texlive-latex texlive-latexconfig doxygen-latex texlive-collection-latex \
+               texlive-latex-bin texlive-lualatex-math texlive-fncychap texlive-tabulary \
+               texlive-framed texlive-wrapfig texlive-upquote texlive-capt-of \
+               texlive-needspace texlive-titlesec texlive-anysize texlive-dvipng \
+               blas-devel lapack-devel libyaml-devel openkim-models kim-api-devel \
+               zstd libzstd-devel
+        dnf clean all
+
+        # we need to reset any module variables
+        # inherited from the host.
+        unset __LMOD_REF_COUNT__LMFILES_
+        unset __LMOD_REF_COUNT_PATH
+        unset __LMOD_REF_COUNT_LD_LIBRARY_PATH
+        unset __LMOD_REF_COUNT_MANPATH
+        unset __LMOD_REF_COUNT_MODULEPATH
+        unset __LMOD_REF_COUNT_LOADEDMODULES
+        unset _LMFILES_
+        unset MODULEPATH
+        unset MODULESHOME
+        unset MODULEPATH_ROOT
+        unset LOADEDMODULES
+        unset LMOD_SYSTEM_DEFAULT_MODULES
+
+        # load MPI by default
+        . /etc/profile
+        module load mpi
+
+        # manually install Plumed
+        mkdir plumed
+        cd plumed
+        version=2.7.2
+        curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
+        tar -xzf plumed.tar.gz
+        cd plumed-${version}
+        ./configure --disable-doc --prefix=/usr
+        make
+        make install
+        # fix up installation for CentOS, Rocky Linux, and Fedora
+        mv -v /usr/lib64/pkgconfig/plumed* /usr/share/pkgconfig/
+        cd ../../
+        rm -rvf plumed
+
+        # create missing readline pkgconfig file
+        cat > /usr/lib64/pkgconfig/readline.pc <<EOF
+prefix=/usr
+exec_prefix=/usr
+libdir=/usr/lib64
+includedir=/usr/include
+
+Name: Readline
+Description: GNU Readline library for command line editing
+URL: http://tiswww.cwru.edu/php/chet/readline/rltop.html
+Version: 7.0
+Requires.private: ncurses
+Libs: -L\${libdir} -lreadline
+Cflags: -I\${includedir}/readline
+EOF
+        # set custom prompt indicating the container name
+        CUSTOM_PROMPT_ENV=/.singularity.d/env/99-zz_custom_prompt.sh
+        cat >$CUSTOM_PROMPT_ENV <<EOF
+#!/bin/bash
+PS1="[rocky8:\u@\h] \W> "
+EOF
+        chmod 755 $CUSTOM_PROMPT_ENV
+
+
+%environment
+        LC_ALL=C
+        export LC_ALL
+
+        # we need to reset any module variables
+        # inherited from the host.
+        unset __LMOD_REF_COUNT__LMFILES_
+        unset __LMOD_REF_COUNT_PATH
+        unset __LMOD_REF_COUNT_LD_LIBRARY_PATH
+        unset __LMOD_REF_COUNT_MANPATH
+        unset __LMOD_REF_COUNT_MODULEPATH
+        unset __LMOD_REF_COUNT_LOADEDMODULES
+        unset _LMFILES_
+        unset MODULEPATH
+        unset MODULESHOME
+        unset MODULEPATH_ROOT
+        unset LOADEDMODULES
+        unset LMOD_SYSTEM_DEFAULT_MODULES
+
+        # load MPI by default
+        . /etc/profile
+        module load mpi
+        # tell OpenMPI to not try using Infiniband
+        OMPI_MCA_btl="^openib"
+        # do not warn about unused components as this messes up testing
+        OMPI_MCA_btl_base_warn_component_unused="0"
+        export OMPI_MCA_btl OMPI_MCA_btl_base_warn_component_unused
+
+%labels
+        Author akohlmey, rbberger
diff --git a/tools/singularity/ubuntu18.04.def b/tools/singularity/ubuntu18.04.def
index 8d343af7ea..35247d8e2a 100644
--- a/tools/singularity/ubuntu18.04.def
+++ b/tools/singularity/ubuntu18.04.def
@@ -105,7 +105,7 @@ From: ubuntu:18.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.6.1
+    export PLUMED_PKG_VERSION=2.7.2
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu18.04_amd_rocm.def b/tools/singularity/ubuntu18.04_amd_rocm.def
index 7b06970110..407cda1250 100644
--- a/tools/singularity/ubuntu18.04_amd_rocm.def
+++ b/tools/singularity/ubuntu18.04_amd_rocm.def
@@ -3,7 +3,7 @@ From: ubuntu:18.04
 
 %environment
     export PATH=/usr/lib/ccache:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
 %post
     export DEBIAN_FRONTEND=noninteractive
     apt-get update
@@ -94,7 +94,7 @@ From: ubuntu:18.04
     ###########################################################################
 
     export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    git clone -b rocm-4.1.x https://github.com/ROCmSoftwarePlatform/hipCUB.git
+    git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
     mkdir hipCUB/build
     cd hipCUB/build
     CXX=hipcc cmake -D BUILD_TEST=off ..
@@ -129,7 +129,7 @@ From: ubuntu:18.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.6.1
+    export PLUMED_PKG_VERSION=2.7.2
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu18.04_gpu.def b/tools/singularity/ubuntu18.04_gpu.def
index cfd94c8531..9a1d37792e 100644
--- a/tools/singularity/ubuntu18.04_gpu.def
+++ b/tools/singularity/ubuntu18.04_gpu.def
@@ -2,11 +2,11 @@ BootStrap: docker
 From: ubuntu:18.04
 
 %environment
-    export PATH=/usr/lib/ccache:/usr/local/cuda-11.0/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    export CUDADIR=/usr/local/cuda-11.0
-    export CUDA_PATH=/usr/local/cuda-11.0
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.0/lib64
-    export LIBRARY_PATH=/usr/local/cuda-11.0/lib64/stubs
+    export PATH=/usr/lib/ccache:/usr/local/cuda-11.4/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
+    export CUDADIR=/usr/local/cuda-11.4
+    export CUDA_PATH=/usr/local/cuda-11.4
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.4/lib64:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
+    export LIBRARY_PATH=/usr/local/cuda-11.4/lib64/stubs
 %post
     export DEBIAN_FRONTEND=noninteractive
     apt-get update
@@ -104,23 +104,19 @@ From: ubuntu:18.04
     add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
     apt-get update
 
-    export CUDA_PKG_VERSION=11.0
+    export CUDA_PKG_VERSION=11.4
 
     apt-get install -y --no-install-recommends \
-        cuda-libraries-$CUDA_PKG_VERSION \
-        cuda-command-line-tools-$CUDA_PKG_VERSION \
-        cuda-libraries-dev-$CUDA_PKG_VERSION \
-        cuda-minimal-build-$CUDA_PKG_VERSION \
+        cuda-libraries-${CUDA_PKG_VERSION} \
+        cuda-command-line-tools-${CUDA_PKG_VERSION} \
+        cuda-libraries-dev-${CUDA_PKG_VERSION} \
+        cuda-minimal-build-${CUDA_PKG_VERSION} \
         cuda-compat-$CUDA_PKG_VERSION \
-        libcublas-11-0 \
-        libcublas-dev-11-0
-
-    echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf
-    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
+        libcublas-${CUDA_PKG_VERSION} \
+        libcublas-dev-${CUDA_PKG_VERSION}
 
     # add missing symlink
-    ln -s /usr/local/cuda-11.0 /usr/local/cuda
-    ln -s /usr/local/cuda-11.0/lib64/stubs/libcuda.so /usr/local/cuda-11.0/lib64/stubs/libcuda.so.1
+    ln -s /usr/local/cuda-${CUDA_PKG_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_PKG_VERSION}/lib64/stubs/libcuda.so.1
 
     ###########################################################################
     # NVIDIA OpenCL
@@ -134,7 +130,7 @@ From: ubuntu:18.04
     ###########################################################################
 
     export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    git clone -b rocm-3.7.x https://github.com/ROCmSoftwarePlatform/hipCUB.git
+    git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
     mkdir hipCUB/build
     cd hipCUB/build
     CXX=hipcc cmake -D BUILD_TEST=off ..
@@ -169,7 +165,7 @@ From: ubuntu:18.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.6.1
+    export PLUMED_PKG_VERSION=2.7.2
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu18.04_intel_opencl.def b/tools/singularity/ubuntu18.04_intel_opencl.def
index 01f0d78d0a..95c744c67d 100644
--- a/tools/singularity/ubuntu18.04_intel_opencl.def
+++ b/tools/singularity/ubuntu18.04_intel_opencl.def
@@ -106,7 +106,7 @@ From: ubuntu:18.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.6.1
+    export PLUMED_PKG_VERSION=2.7.2
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu18.04_nvidia.def b/tools/singularity/ubuntu18.04_nvidia.def
index 2b6fcf8c45..359e1d1c4d 100644
--- a/tools/singularity/ubuntu18.04_nvidia.def
+++ b/tools/singularity/ubuntu18.04_nvidia.def
@@ -1,5 +1,5 @@
 BootStrap: docker
-From: nvidia/cuda:11.0-devel-ubuntu18.04
+From: nvidia/cuda:11.4.1-devel-ubuntu18.04
 
 %post
     export DEBIAN_FRONTEND=noninteractive
@@ -105,7 +105,7 @@ From: nvidia/cuda:11.0-devel-ubuntu18.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.6.1
+    export PLUMED_PKG_VERSION=2.7.2
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu20.04.def b/tools/singularity/ubuntu20.04.def
index 7f081ab2e3..f85d3ca614 100644
--- a/tools/singularity/ubuntu20.04.def
+++ b/tools/singularity/ubuntu20.04.def
@@ -100,7 +100,7 @@ From: ubuntu:20.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.6.1
+    export PLUMED_PKG_VERSION=2.7.2
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu20.04_amd_rocm.def b/tools/singularity/ubuntu20.04_amd_rocm.def
index 9db8265629..2b4176f183 100644
--- a/tools/singularity/ubuntu20.04_amd_rocm.def
+++ b/tools/singularity/ubuntu20.04_amd_rocm.def
@@ -3,7 +3,7 @@ From: ubuntu:20.04
 
 %environment
     export PATH=/usr/lib/ccache:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.2.0/llvm/lib
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
 %post
     export DEBIAN_FRONTEND=noninteractive
     apt-get update
@@ -91,7 +91,7 @@ From: ubuntu:20.04
     ###########################################################################
 
     export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    git clone -b rocm-4.1.x https://github.com/ROCmSoftwarePlatform/hipCUB.git
+    git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
     mkdir hipCUB/build
     cd hipCUB/build
     CXX=hipcc cmake -D BUILD_TEST=off ..
@@ -126,7 +126,7 @@ From: ubuntu:20.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.6.1
+    export PLUMED_PKG_VERSION=2.7.2
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu20.04_gpu.def b/tools/singularity/ubuntu20.04_gpu.def
index 44f975d2c8..3ea759078b 100644
--- a/tools/singularity/ubuntu20.04_gpu.def
+++ b/tools/singularity/ubuntu20.04_gpu.def
@@ -2,11 +2,11 @@ BootStrap: docker
 From: ubuntu:20.04
 
 %environment
-    export PATH=/usr/lib/ccache:/usr/local/cuda-11.0/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    export CUDADIR=/usr/local/cuda-11.0
-    export CUDA_PATH=/usr/local/cuda-11.0
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.0/lib64:/opt/rocm/lib:/opt/rocm-4.2.0/llvm/lib
-    export LIBRARY_PATH=/usr/local/cuda-11.0/lib64/stubs
+    export PATH=/usr/lib/ccache:/usr/local/cuda-11.4/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
+    export CUDADIR=/usr/local/cuda-11.4
+    export CUDA_PATH=/usr/local/cuda-11.4
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.4/lib64:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
+    export LIBRARY_PATH=/usr/local/cuda-11.4/lib64/stubs
 %post
     export DEBIAN_FRONTEND=noninteractive
     apt-get update
@@ -101,23 +101,19 @@ From: ubuntu:20.04
     add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
     apt-get update
 
-    export CUDA_PKG_VERSION=11.0
+    export CUDA_PKG_VERSION=11.4
 
     apt-get install -y --no-install-recommends \
-        cuda-libraries-$CUDA_PKG_VERSION \
-        cuda-command-line-tools-$CUDA_PKG_VERSION \
-        cuda-libraries-dev-$CUDA_PKG_VERSION \
-        cuda-minimal-build-$CUDA_PKG_VERSION \
+        cuda-libraries-${CUDA_PKG_VERSION} \
+        cuda-command-line-tools-${CUDA_PKG_VERSION} \
+        cuda-libraries-dev-${CUDA_PKG_VERSION} \
+        cuda-minimal-build-${CUDA_PKG_VERSION} \
         cuda-compat-$CUDA_PKG_VERSION \
-        libcublas-11-0 \
-        libcublas-dev-11-0
-
-    echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf
-    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
+        libcublas-${CUDA_PKG_VERSION} \
+        libcublas-dev-${CUDA_PKG_VERSION}
 
     # add missing symlink
-    ln -s /usr/local/cuda-11.0 /usr/local/cuda
-    ln -s /usr/local/cuda-11.0/lib64/stubs/libcuda.so /usr/local/cuda-11.0/lib64/stubs/libcuda.so.1
+    ln -s /usr/local/cuda-${CUDA_PKG_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_PKG_VERSION}/lib64/stubs/libcuda.so.1
 
     ###########################################################################
     # NVIDIA OpenCL
@@ -131,7 +127,7 @@ From: ubuntu:20.04
     ###########################################################################
 
     export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-    git clone -b rocm-4.2.x https://github.com/ROCmSoftwarePlatform/hipCUB.git
+    git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
     mkdir hipCUB/build
     cd hipCUB/build
     CXX=hipcc cmake -D BUILD_TEST=off ..
@@ -166,7 +162,7 @@ From: ubuntu:20.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.6.1
+    export PLUMED_PKG_VERSION=2.7.2
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu20.04_intel_opencl.def b/tools/singularity/ubuntu20.04_intel_opencl.def
index 82ca53a851..7c83ecb5b1 100644
--- a/tools/singularity/ubuntu20.04_intel_opencl.def
+++ b/tools/singularity/ubuntu20.04_intel_opencl.def
@@ -99,7 +99,7 @@ From: ubuntu:20.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.6.1
+    export PLUMED_PKG_VERSION=2.7.2
 
     mkdir plumed
     cd plumed
diff --git a/tools/singularity/ubuntu20.04_nvidia.def b/tools/singularity/ubuntu20.04_nvidia.def
index 7bbc3ab0b5..ddcbd34db9 100644
--- a/tools/singularity/ubuntu20.04_nvidia.def
+++ b/tools/singularity/ubuntu20.04_nvidia.def
@@ -1,5 +1,5 @@
 BootStrap: docker
-From: nvidia/cuda:11.0-devel-ubuntu20.04
+From: nvidia/cuda:11.4.1-devel-ubuntu20.04
 
 %post
     export DEBIAN_FRONTEND=noninteractive
@@ -102,7 +102,7 @@ From: nvidia/cuda:11.0-devel-ubuntu20.04
     # Plumed
     ###########################################################################
 
-    export PLUMED_PKG_VERSION=2.6.1
+    export PLUMED_PKG_VERSION=2.7.2
 
     mkdir plumed
     cd plumed
diff --git a/tools/swig/CMakeLists.txt b/tools/swig/CMakeLists.txt
index 204b351ed6..966837dc2f 100644
--- a/tools/swig/CMakeLists.txt
+++ b/tools/swig/CMakeLists.txt
@@ -90,7 +90,15 @@ if(BUILD_SWIG_TCL)
   # build loadable Tcl module
   set_property(SOURCE lammps.i PROPERTY SWIG_MODULE_NAME tcllammps)
   swig_add_library(tcllammps TYPE MODULE LANGUAGE tcl SOURCES lammps.i)
-  find_package(TCL REQUIRED)
+  find_package(TCL)
+  if(NOT TCL_FOUND)
+    message(FATAL_ERROR "Tcl development headers and libraries are required")
+  endif()
+  find_package(TclStub)
+  if(TCL_STUB_LIBRARY)
+    target_compile_definitions(tcllammps PRIVATE USE_TCL_STUBS)
+    target_link_libraries(tcllammps PRIVATE ${TCL_STUB_LIBRARY})
+  endif()
   target_include_directories(tcllammps PRIVATE ${TCL_INCLUDE_PATH})
   swig_link_libraries(tcllammps PRIVATE lammps ${TCL_LIBRARY})
   # build extended Tcl shell binary
diff --git a/unittest/CMakeLists.txt b/unittest/CMakeLists.txt
index bb746c13ec..2491c26796 100644
--- a/unittest/CMakeLists.txt
+++ b/unittest/CMakeLists.txt
@@ -7,7 +7,7 @@ add_test(NAME RunLammps
          COMMAND $<TARGET_FILE:lmp> -log none -echo none -in in.empty
          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(RunLammps PROPERTIES
-        ENVIRONMENT "TSAN_OPTIONS=ignore_noninstrumented_modules=1"
+        ENVIRONMENT "TSAN_OPTIONS=ignore_noninstrumented_modules=1;HWLOC_HIDE_ERRORS=1"
         PASS_REGULAR_EXPRESSION "^LAMMPS \\([0-9]+ [A-Za-z]+ 2[0-9][0-9][0-9]\\)")
 
 # check if the compiled executable will print the help message
@@ -15,7 +15,7 @@ add_test(NAME HelpMessage
          COMMAND $<TARGET_FILE:lmp> -h
          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(HelpMessage PROPERTIES
-         ENVIRONMENT "TSAN_OPTIONS=ignore_noninstrumented_modules=1"
+         ENVIRONMENT "TSAN_OPTIONS=ignore_noninstrumented_modules=1;HWLOC_HIDE_ERRORS=1"
          PASS_REGULAR_EXPRESSION ".*Large-scale Atomic/Molecular Massively Parallel Simulator -.*Usage example:.*")
 
 # check if the compiled executable will error out on an invalid command line flag
@@ -23,7 +23,7 @@ add_test(NAME InvalidFlag
          COMMAND $<TARGET_FILE:lmp> -xxx
          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set_tests_properties(InvalidFlag PROPERTIES
-         ENVIRONMENT "TSAN_OPTIONS=ignore_noninstrumented_modules=1"
+         ENVIRONMENT "TSAN_OPTIONS=ignore_noninstrumented_modules=1;HWLOC_HIDE_ERRORS=1"
          PASS_REGULAR_EXPRESSION "ERROR: Invalid command-line argument.*")
 
 if(BUILD_MPI)