Merge branch 'master' into acks2_release

This commit is contained in:
Axel Kohlmeyer
2021-09-30 00:26:25 -04:00
568 changed files with 4363 additions and 4091 deletions

View File

@ -1,6 +1,6 @@
message(STATUS "Downloading and building OpenCL loader library") message(STATUS "Downloading and building OpenCL loader library")
set(OPENCL_LOADER_URL "${LAMMPS_THIRDPARTY_URL}/opencl-loader-2021.06.30.tar.gz" CACHE STRING "URL for OpenCL loader tarball") set(OPENCL_LOADER_URL "${LAMMPS_THIRDPARTY_URL}/opencl-loader-2021.09.18.tar.gz" CACHE STRING "URL for OpenCL loader tarball")
set(OPENCL_LOADER_MD5 "f9e55dd550cfbf77f46507adf7cb8fd2" CACHE STRING "MD5 checksum of OpenCL loader tarball") set(OPENCL_LOADER_MD5 "3b3882627964bd02e5c3b02065daac3c" CACHE STRING "MD5 checksum of OpenCL loader tarball")
mark_as_advanced(OPENCL_LOADER_URL) mark_as_advanced(OPENCL_LOADER_URL)
mark_as_advanced(OPENCL_LOADER_MD5) mark_as_advanced(OPENCL_LOADER_MD5)

View File

@ -71,6 +71,11 @@ if(GPU_API STREQUAL "CUDA")
# build arch/gencode commands for nvcc based on CUDA toolkit version and use choice # build arch/gencode commands for nvcc based on CUDA toolkit version and use choice
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture # --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
set(GPU_CUDA_GENCODE "-arch=${GPU_ARCH}") set(GPU_CUDA_GENCODE "-arch=${GPU_ARCH}")
# apply the following to build "fat" CUDA binaries only for known CUDA toolkits
if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
message(WARNING "Untested CUDA Toolkit version. Use at your own risk")
else()
# Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0 # Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0
if((CUDA_VERSION VERSION_GREATER_EQUAL "3.2") AND (CUDA_VERSION VERSION_LESS "9.0")) if((CUDA_VERSION VERSION_GREATER_EQUAL "3.2") AND (CUDA_VERSION VERSION_LESS "9.0"))
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_20,code=[sm_20,compute_20] ") string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_20,code=[sm_20,compute_20] ")
@ -107,8 +112,6 @@ if(GPU_API STREQUAL "CUDA")
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.1") if(CUDA_VERSION VERSION_GREATER_EQUAL "11.1")
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_86,code=[sm_86,compute_86]") string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_86,code=[sm_86,compute_86]")
endif() endif()
if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
message(WARNING "Unsupported CUDA version. Use at your own risk.")
endif() endif()
cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS ${CUDA_REQUEST_PIC} cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS ${CUDA_REQUEST_PIC}
@ -214,13 +217,20 @@ elseif(GPU_API STREQUAL "OPENCL")
elseif(GPU_API STREQUAL "HIP") elseif(GPU_API STREQUAL "HIP")
if(NOT DEFINED HIP_PATH) if(NOT DEFINED HIP_PATH)
if(NOT DEFINED ENV{HIP_PATH}) if(NOT DEFINED ENV{HIP_PATH})
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to HIP installation")
else() else()
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to HIP installation")
endif() endif()
endif() endif()
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) if(NOT DEFINED ROCM_PATH)
find_package(HIP REQUIRED) if(NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation")
else()
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation")
endif()
endif()
list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH})
find_package(hip REQUIRED)
option(HIP_USE_DEVICE_SORT "Use GPU sorting" ON) option(HIP_USE_DEVICE_SORT "Use GPU sorting" ON)
if(NOT DEFINED HIP_PLATFORM) if(NOT DEFINED HIP_PLATFORM)
@ -322,10 +332,11 @@ elseif(GPU_API STREQUAL "HIP")
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h ${LAMMPS_LIB_BINARY_DIR}/gpu/*.cu.cpp") set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h ${LAMMPS_LIB_BINARY_DIR}/gpu/*.cu.cpp")
hip_add_library(gpu STATIC ${GPU_LIB_SOURCES}) add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT) target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_HIP) target_compile_definitions(gpu PRIVATE -DUSE_HIP)
target_link_libraries(gpu PRIVATE hip::host)
if(HIP_USE_DEVICE_SORT) if(HIP_USE_DEVICE_SORT)
# add hipCUB # add hipCUB
@ -374,8 +385,9 @@ elseif(GPU_API STREQUAL "HIP")
endif() endif()
endif() endif()
hip_add_executable(hip_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp) add_executable(hip_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(hip_get_devices PRIVATE -DUCL_HIP) target_compile_definitions(hip_get_devices PRIVATE -DUCL_HIP)
target_link_libraries(hip_get_devices hip::host)
if(HIP_PLATFORM STREQUAL "nvcc") if(HIP_PLATFORM STREQUAL "nvcc")
target_compile_definitions(gpu PRIVATE -D__HIP_PLATFORM_NVCC__) target_compile_definitions(gpu PRIVATE -D__HIP_PLATFORM_NVCC__)

View File

@ -1,6 +1,8 @@
######################################################################## ########################################################################
# As of version 3.3.0 Kokkos requires C++14 # As of version 3.3.0 Kokkos requires C++14
if(CMAKE_CXX_STANDARD LESS 14)
set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD 14)
endif()
######################################################################## ########################################################################
# consistency checks and Kokkos options/settings required by LAMMPS # consistency checks and Kokkos options/settings required by LAMMPS
if(Kokkos_ENABLE_CUDA) if(Kokkos_ENABLE_CUDA)

View File

@ -19,6 +19,14 @@ if(DOWNLOAD_LATTE)
set(LATTE_MD5 "820e73a457ced178c08c71389a385de7" CACHE STRING "MD5 checksum of LATTE tarball") set(LATTE_MD5 "820e73a457ced178c08c71389a385de7" CACHE STRING "MD5 checksum of LATTE tarball")
mark_as_advanced(LATTE_URL) mark_as_advanced(LATTE_URL)
mark_as_advanced(LATTE_MD5) mark_as_advanced(LATTE_MD5)
# CMake cannot pass BLAS or LAPACK library variable to external project if they are a list
list(LENGTH BLAS_LIBRARIES NUM_BLAS)
list(LENGTH LAPACK_LIBRARIES NUM_LAPACK)
if((NUM_BLAS GREATER 1) OR (NUM_LAPACK GREATER 1))
message(FATAL_ERROR "Cannot compile downloaded LATTE library due to a technical limitation")
endif()
include(ExternalProject) include(ExternalProject)
ExternalProject_Add(latte_build ExternalProject_Add(latte_build
URL ${LATTE_URL} URL ${LATTE_URL}

View File

@ -45,12 +45,12 @@ if(DOWNLOAD_N2P2)
# get path to MPI include directory when cross-compiling to windows # get path to MPI include directory when cross-compiling to windows
if((CMAKE_SYSTEM_NAME STREQUAL Windows) AND CMAKE_CROSSCOMPILING) if((CMAKE_SYSTEM_NAME STREQUAL Windows) AND CMAKE_CROSSCOMPILING)
get_target_property(N2P2_MPI_INCLUDE MPI::MPI_CXX INTERFACE_INCLUDE_DIRECTORIES) get_target_property(N2P2_MPI_INCLUDE MPI::MPI_CXX INTERFACE_INCLUDE_DIRECTORIES)
set(N2P2_PROJECT_OPTIONS "-I ${N2P2_MPI_INCLUDE} -DMPICH_SKIP_MPICXX=1") set(N2P2_PROJECT_OPTIONS "-I${N2P2_MPI_INCLUDE}")
set(MPI_CXX_COMPILER ${CMAKE_CXX_COMPILER}) set(MPI_CXX_COMPILER ${CMAKE_CXX_COMPILER})
endif() endif()
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
get_target_property(N2P2_MPI_INCLUDE MPI::MPI_CXX INTERFACE_INCLUDE_DIRECTORIES) get_target_property(N2P2_MPI_INCLUDE MPI::MPI_CXX INTERFACE_INCLUDE_DIRECTORIES)
set(N2P2_PROJECT_OPTIONS "-I ${N2P2_MPI_INCLUDE} -DMPICH_SKIP_MPICXX=1") set(N2P2_PROJECT_OPTIONS "-I${N2P2_MPI_INCLUDE}")
set(MPI_CXX_COMPILER ${CMAKE_CXX_COMPILER}) set(MPI_CXX_COMPILER ${CMAKE_CXX_COMPILER})
endif() endif()
endif() endif()
@ -69,6 +69,12 @@ if(DOWNLOAD_N2P2)
# echo final flag for debugging # echo final flag for debugging
message(STATUS "N2P2 BUILD OPTIONS: ${N2P2_BUILD_OPTIONS}") message(STATUS "N2P2 BUILD OPTIONS: ${N2P2_BUILD_OPTIONS}")
# must have "sed" command to compile n2p2 library (for now)
find_program(HAVE_SED sed)
if(NOT HAVE_SED)
message(FATAL_ERROR "Must have 'sed' program installed to compile 'n2p2' library for ML-HDNNP package")
endif()
# download and compile n2p2 library. must patch MPI calls in LAMMPS interface to accommodate MPI-2 (e.g. for cross-compiling)
include(ExternalProject) include(ExternalProject)
ExternalProject_Add(n2p2_build ExternalProject_Add(n2p2_build

View File

@ -38,7 +38,7 @@ if(DOWNLOAD_QUIP)
set(temp "${temp}HAVE_LOCAL_E_MIX=0\nHAVE_QC=0\nHAVE_GAP=1\nHAVE_DESCRIPTORS_NONCOMMERCIAL=1\n") set(temp "${temp}HAVE_LOCAL_E_MIX=0\nHAVE_QC=0\nHAVE_GAP=1\nHAVE_DESCRIPTORS_NONCOMMERCIAL=1\n")
set(temp "${temp}HAVE_TURBOGAP=0\nHAVE_QR=1\nHAVE_THIRDPARTY=0\nHAVE_FX=0\nHAVE_SCME=0\nHAVE_MTP=0\n") set(temp "${temp}HAVE_TURBOGAP=0\nHAVE_QR=1\nHAVE_THIRDPARTY=0\nHAVE_FX=0\nHAVE_SCME=0\nHAVE_MTP=0\n")
set(temp "${temp}HAVE_MBD=0\nHAVE_TTM_NF=0\nHAVE_CH4=0\nHAVE_NETCDF4=0\nHAVE_MDCORE=0\nHAVE_ASAP=0\n") set(temp "${temp}HAVE_MBD=0\nHAVE_TTM_NF=0\nHAVE_CH4=0\nHAVE_NETCDF4=0\nHAVE_MDCORE=0\nHAVE_ASAP=0\n")
set(temp "${temp}HAVE_CGAL=0\nHAVE_METIS=0\nHAVE_LMTO_TBE=0\n") set(temp "${temp}HAVE_CGAL=0\nHAVE_METIS=0\nHAVE_LMTO_TBE=0\nHAVE_SCALAPACK=0\n")
file(WRITE ${CMAKE_BINARY_DIR}/quip.config "${temp}") file(WRITE ${CMAKE_BINARY_DIR}/quip.config "${temp}")
message(STATUS "QUIP download via git requested - we will build our own") message(STATUS "QUIP download via git requested - we will build our own")
@ -50,7 +50,7 @@ if(DOWNLOAD_QUIP)
GIT_TAG origin/public GIT_TAG origin/public
GIT_SHALLOW YES GIT_SHALLOW YES
GIT_PROGRESS YES GIT_PROGRESS YES
PATCH_COMMAND cp ${CMAKE_BINARY_DIR}/quip.config <SOURCE_DIR>/arch/Makefile.lammps PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_BINARY_DIR}/quip.config <SOURCE_DIR>/arch/Makefile.lammps
CONFIGURE_COMMAND env QUIP_ARCH=lammps make config CONFIGURE_COMMAND env QUIP_ARCH=lammps make config
BUILD_COMMAND env QUIP_ARCH=lammps make libquip BUILD_COMMAND env QUIP_ARCH=lammps make libquip
INSTALL_COMMAND "" INSTALL_COMMAND ""

View File

@ -12,6 +12,13 @@ if(DOWNLOAD_MSCG)
mark_as_advanced(MSCG_URL) mark_as_advanced(MSCG_URL)
mark_as_advanced(MSCG_MD5) mark_as_advanced(MSCG_MD5)
# CMake cannot pass BLAS or LAPACK library variable to external project if they are a list
list(LENGTH BLAS_LIBRARIES NUM_BLAS)
list(LENGTH LAPACK_LIBRARIES NUM_LAPACK)
if((NUM_BLAS GREATER 1) OR (NUM_LAPACK GREATER 1))
message(FATAL_ERROR "Cannot compile downloaded MSCG library due to a technical limitation")
endif()
include(ExternalProject) include(ExternalProject)
ExternalProject_Add(mscg_build ExternalProject_Add(mscg_build
URL ${MSCG_URL} URL ${MSCG_URL}

View File

@ -23,6 +23,11 @@ if(DOWNLOAD_SCAFACOS)
file(DOWNLOAD ${LAMMPS_THIRDPARTY_URL}/scafacos-1.0.1-fix.diff ${CMAKE_CURRENT_BINARY_DIR}/scafacos-1.0.1.fix.diff file(DOWNLOAD ${LAMMPS_THIRDPARTY_URL}/scafacos-1.0.1-fix.diff ${CMAKE_CURRENT_BINARY_DIR}/scafacos-1.0.1.fix.diff
EXPECTED_HASH MD5=4baa1333bb28fcce102d505e1992d032) EXPECTED_HASH MD5=4baa1333bb28fcce102d505e1992d032)
find_program(HAVE_PATCH patch)
if(NOT HAVE_PATCH)
message(FATAL_ERROR "The 'patch' program is required to build the ScaFaCoS library")
endif()
include(ExternalProject) include(ExternalProject)
ExternalProject_Add(scafacos_build ExternalProject_Add(scafacos_build
URL ${SCAFACOS_URL} URL ${SCAFACOS_URL}

View File

@ -26,6 +26,11 @@ if(DOWNLOAD_VORO)
set(VORO_BUILD_OPTIONS CXX=${CMAKE_CXX_COMPILER} CFLAGS=${VORO_BUILD_CFLAGS}) set(VORO_BUILD_OPTIONS CXX=${CMAKE_CXX_COMPILER} CFLAGS=${VORO_BUILD_CFLAGS})
endif() endif()
find_program(HAVE_PATCH patch)
if(NOT HAVE_PATCH)
message(FATAL_ERROR "The 'patch' program is required to build the voro++ library")
endif()
ExternalProject_Add(voro_build ExternalProject_Add(voro_build
URL ${VORO_URL} URL ${VORO_URL}
URL_MD5 ${VORO_MD5} URL_MD5 ${VORO_MD5}

View File

@ -1,7 +1,28 @@
[ [
{ include: [ "<bits/types/struct_rusage.h>", private, "<sys/resource.h>", public ] },
{ include: [ "<bits/exception.h>", public, "<exception>", public ] },
{ include: [ "@<Eigen/.*>", private, "<Eigen/Eigen>", public ] }, { include: [ "@<Eigen/.*>", private, "<Eigen/Eigen>", public ] },
{ include: [ "@<gtest/.*>", private, "\"gtest/gtest.h\"", public ] }, { include: [ "@<gtest/.*>", private, "\"gtest/gtest.h\"", public ] },
{ include: [ "@<gmock/.*>", private, "\"gmock/gmock.h\"", public ] }, { include: [ "@<gmock/.*>", private, "\"gmock/gmock.h\"", public ] },
{ include: [ "@<gmock/.*>", private, "\"gmock/gmock.h\"", public ] },
{ include: [ "@<(cell|c_loops|container).hh>", private, "<voro++.hh>", public ] },
{ include: [ "@\"atom_vec_.*.h\"", public, "\"style_atom.h\"", public ] },
{ include: [ "@\"body_.*.h\"", public, "\"style_body.h\"", public ] },
{ include: [ "@\"compute_.*.h\"", public, "\"style_compute.h\"", public ] },
{ include: [ "@\"fix_.*.h\"", public, "\"style_fix.h\"", public ] },
{ include: [ "@\"dump_.*.h\"", public, "\"style_dump.h\"", public ] },
{ include: [ "@\"min_.*.h\"", public, "\"style_minimize.h\"", public ] },
{ include: [ "@\"reader_.*.h\"", public, "\"style_reader.h\"", public ] },
{ include: [ "@\"region_.*.h\"", public, "\"style_region.h\"", public ] },
{ include: [ "@\"pair_.*.h\"", public, "\"style_pair.h\"", public ] },
{ include: [ "@\"angle_.*.h\"", public, "\"style_angle.h\"", public ] },
{ include: [ "@\"bond_.*.h\"", public, "\"style_bond.h\"", public ] },
{ include: [ "@\"dihedral_.*.h\"", public, "\"style_dihedral.h\"", public ] },
{ include: [ "@\"improper_.*.h\"", public, "\"style_improper.h\"", public ] },
{ include: [ "@\"kspace_.*.h\"", public, "\"style_kspace.h\"", public ] },
{ include: [ "@\"nbin_.*.h\"", public, "\"style_nbin.h\"", public ] },
{ include: [ "@\"npair_.*.h\"", public, "\"style_npair.h\"", public ] },
{ include: [ "@\"nstenci_.*.h\"", public, "\"style_nstencil.h\"", public ] },
{ include: [ "@\"ntopo_.*.h\"", public, "\"style_ntopo.h\"", public ] },
{ include: [ "<float.h>", public, "<cfloat>", public ] },
{ include: [ "<limits.h>", public, "<climits>", public ] },
{ include: [ "<bits/types/struct_tm.h>", private, "<ctime>", public ] },
] ]

View File

@ -0,0 +1,30 @@
# preset that will enable hip (clang/clang++) with support for MPI and OpenMP (on Linux boxes)
# prefer flang over gfortran, if available
find_program(CLANG_FORTRAN NAMES flang gfortran f95)
set(ENV{OMPI_FC} ${CLANG_FORTRAN})
set(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "" FORCE)
set(CMAKE_C_COMPILER "hipcc" CACHE STRING "" FORCE)
set(CMAKE_Fortran_COMPILER ${CLANG_FORTRAN} CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_DEBUG "-Wall -Wextra -g" CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG" CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "" FORCE)
set(CMAKE_Fortran_FLAGS_DEBUG "-Wall -Wextra -g -std=f2003" CACHE STRING "" FORCE)
set(CMAKE_Fortran_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG -std=f2003" CACHE STRING "" FORCE)
set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -DNDEBUG -std=f2003" CACHE STRING "" FORCE)
set(CMAKE_C_FLAGS_DEBUG "-Wall -Wextra -g" CACHE STRING "" FORCE)
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG" CACHE STRING "" FORCE)
set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "" FORCE)
set(MPI_CXX "hipcc" CACHE STRING "" FORCE)
set(MPI_CXX_COMPILER "mpicxx" CACHE STRING "" FORCE)
unset(HAVE_OMP_H_INCLUDE CACHE)
set(OpenMP_C "hipcc" CACHE STRING "" FORCE)
set(OpenMP_C_FLAGS "-fopenmp" CACHE STRING "" FORCE)
set(OpenMP_C_LIB_NAMES "omp" CACHE STRING "" FORCE)
set(OpenMP_CXX "hipcc" CACHE STRING "" FORCE)
set(OpenMP_CXX_FLAGS "-fopenmp" CACHE STRING "" FORCE)
set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE)
set(OpenMP_omp_LIBRARY "libomp.so" CACHE PATH "" FORCE)

View File

@ -24,6 +24,7 @@ set(ALL_PACKAGES
DRUDE DRUDE
EFF EFF
EXTRA-COMPUTE EXTRA-COMPUTE
EXTRA-DUMP
EXTRA-FIX EXTRA-FIX
EXTRA-MOLECULE EXTRA-MOLECULE
EXTRA-PAIR EXTRA-PAIR

View File

@ -1,4 +1,4 @@
.TH LAMMPS "31 August 2021" "2021-08-31" .TH LAMMPS "29 September 2021" "2021-09-29"
.SH NAME .SH NAME
.B LAMMPS .B LAMMPS
\- Molecular Dynamics Simulator. \- Molecular Dynamics Simulator.

View File

@ -58,13 +58,16 @@ Report missing and unneeded '#include' statements (CMake only)
The conventions for how and when to use and order include statements in The conventions for how and when to use and order include statements in
LAMMPS are documented in :doc:`Modify_style`. To assist with following LAMMPS are documented in :doc:`Modify_style`. To assist with following
these conventions one can use the `Include What You Use tool <https://include-what-you-use.org/>`_. these conventions one can use the `Include What You Use tool <https://include-what-you-use.org/>`_.
This is still under development and for large and complex projects like LAMMPS This tool is still under development and for large and complex projects like LAMMPS
there are some false positives, so suggested changes need to be verified manually. there are some false positives, so suggested changes need to be verified manually.
It is recommended to use at least version 0.14, which has much fewer incorrect It is recommended to use at least version 0.16, which has much fewer incorrect
reports than earlier versions. reports than earlier versions. To install the IWYU toolkit, you need to have
the clang compiler **and** its development package installed. Download the IWYU
version that matches the version of the clang compiler, configure, build, and
install it.
The necessary steps to generate the report can be enabled via a The necessary steps to generate the report can be enabled via a CMake variable
CMake variable: during CMake configuration.
.. code-block:: bash .. code-block:: bash

View File

@ -71,7 +71,8 @@ LAMMPS can use them if they are available on your system.
-D FFTW3_INCLUDE_DIR=path # path to FFTW3 include files -D FFTW3_INCLUDE_DIR=path # path to FFTW3 include files
-D FFTW3_LIBRARY=path # path to FFTW3 libraries -D FFTW3_LIBRARY=path # path to FFTW3 libraries
-D FFT_FFTW_THREADS=on # enable using threaded FFTW3 libraries -D FFTW3_OMP_LIBRARY=path # path to FFTW3 OpenMP wrapper libraries
-D FFT_FFTW_THREADS=on # enable using OpenMP threaded FFTW3 libraries
-D MKL_INCLUDE_DIR=path # ditto for Intel MKL library -D MKL_INCLUDE_DIR=path # ditto for Intel MKL library
-D FFT_MKL_THREADS=on # enable using threaded FFTs with MKL libraries -D FFT_MKL_THREADS=on # enable using threaded FFTs with MKL libraries
-D MKL_LIBRARY=path # path to MKL libraries -D MKL_LIBRARY=path # path to MKL libraries

View File

@ -11,6 +11,7 @@ of time and requests from the LAMMPS user community.
:maxdepth: 1 :maxdepth: 1
Developer_org Developer_org
Developer_parallel
Developer_flow Developer_flow
Developer_write Developer_write
Developer_notes Developer_notes

View File

@ -0,0 +1,120 @@
Communication
^^^^^^^^^^^^^
Following the partitioning scheme in use, all per-atom data (atom IDs,
positions, velocities, types, etc.) is distributed across the MPI
processes, which allows LAMMPS to handle very large systems provided a
correspondingly large number of MPI processes is used. To compute the
short-range interactions, MPI processes need access not only to data of
atoms they "own" but also to information about atoms from neighboring
sub-domains, referred to in LAMMPS as "ghost" atoms. These are copies
of nearby atoms that store the required per-atom data for atoms up to
the communication cutoff distance away. The green
dashed-line boxes in the :ref:`domain-decomposition` figure illustrate
the extended ghost-atom sub-domain for one processor.
This approach is also used to implement periodic boundary
conditions: atoms that lie within the cutoff distance across a periodic
boundary are also stored as ghost atoms and taken from the periodic
replication of the sub-domain, which may be the same sub-domain, e.g. if
running in serial. As a consequence of this, force computation in
LAMMPS is not subject to minimum image conventions and thus cutoffs may
be larger than half the simulation domain.
.. _ghost-atom-comm:
.. figure:: img/ghost-comm.png
:align: center
ghost atom communication
This figure shows the ghost atom communication patterns between
sub-domains for "brick" (left) and "tiled" communication styles for
2d simulations. The numbers indicate MPI process ranks. Here the
sub-domains are drawn spatially separated for clarity. The
dashed-line box is the extended sub-domain of processor 0 which
includes its ghost atoms. The red- and blue-shaded boxes are the
regions of communicated ghost atoms.
Efficient communication patterns are needed to update the "ghost" atom
data, since that needs to be done at every MD time step or minimization
step. The diagrams of the :ref:`ghost-atom-comm` figure illustrate how ghost
atom communication is performed in two stages for a 2d simulation (three
in 3d) for both a regular and irregular partitioning of the simulation
box. For the regular case (left) atoms are exchanged first in the
*x*-direction, then in *y*, with four neighbors in the grid of processor
sub-domains.
In the *x* stage, processor ranks 1 and 2 send owned atoms in their
red-shaded regions to rank 0 (and vice versa). Then in the *y* stage,
ranks 3 and 4 send atoms in their blue-shaded regions to rank 0, which
includes ghost atoms they received in the *x* stage. Rank 0 thus
acquires all its ghost atoms; atoms in the solid blue corner regions
are communicated twice before rank 0 receives them.
For the irregular case (right) the two stages are similar, but a
processor can have more than one neighbor in each direction. In the
*x* stage, MPI ranks 1,2,3 send owned atoms in their red-shaded regions to
rank 0 (and vice versa). These include only atoms between the lower
and upper *y*-boundary of rank 0's sub-domain. In the *y* stage, ranks
4,5,6 send atoms in their blue-shaded regions to rank 0. This may
include ghost atoms they received in the *x* stage, but only if they
are needed by rank 0 to fill its extended ghost atom regions in the
+/-*y* directions (blue rectangles). Thus in this case, ranks 5 and
6 do not include ghost atoms they received from each other (in the *x*
stage) in the atoms they send to rank 0. The key point is that while
the pattern of communication is more complex in the irregular
partitioning case, it can still proceed in two stages (three in 3d)
via atom exchanges with only neighboring processors.
When attributes of owned atoms are sent to neighboring processors to
become attributes of their ghost atoms, LAMMPS calls this a "forward"
communication. On timesteps when atoms migrate to new owning processors
and neighbor lists are rebuilt, each processor creates a list of its
owned atoms which are ghost atoms in each of its neighbor processors.
These lists are used to pack per-atom coordinates (for example) into
message buffers in subsequent steps until the next reneighboring.
A "reverse" communication is when computed ghost atom attributes are
sent back to the processor that owns the atom. This is used (for
example) to sum partial forces on ghost atoms to the complete force on
owned atoms. The order of the two stages described in the
:ref:`ghost-atom-comm` figure is inverted and the same lists of atoms
are used to pack and unpack message buffers with per-atom forces. When
a received buffer is unpacked, the ghost forces are summed to owned atom
forces. As in forward communication, forces on atoms in the four blue
corners of the diagrams are sent, received, and summed twice (once at
each stage) before owning processors have the full force.
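As an illustration, here is a minimal, self-contained sketch of how such a
forward communication can be organized with MPI. It is not the actual code
of the LAMMPS ``Comm`` classes; the names (``forward_comm``, ``sendlist``,
``firstghost``, etc.) are hypothetical, and the example assumes a single
exchange partner per direction and plain coordinate data.

.. code-block:: c++

   #include <mpi.h>
   #include <vector>

   // Forward communication of positions (3 doubles per atom) with one
   // neighboring rank: pack owned atoms selected by a pre-computed send
   // list, exchange buffers, and unpack the received atoms as ghost
   // atoms starting at index "firstghost" of the position array.
   void forward_comm(std::vector<double> &x, const std::vector<int> &sendlist,
                     int firstghost, int nrecv, int sendproc, int recvproc,
                     MPI_Comm world)
   {
     std::vector<double> sendbuf, recvbuf(3 * nrecv);
     sendbuf.reserve(3 * sendlist.size());
     for (int i : sendlist) {                  // pack owned atom coordinates
       sendbuf.push_back(x[3 * i]);
       sendbuf.push_back(x[3 * i + 1]);
       sendbuf.push_back(x[3 * i + 2]);
     }
     MPI_Request request;
     MPI_Irecv(recvbuf.data(), 3 * nrecv, MPI_DOUBLE, recvproc, 0, world, &request);
     MPI_Send(sendbuf.data(), (int) sendbuf.size(), MPI_DOUBLE, sendproc, 0, world);
     MPI_Wait(&request, MPI_STATUS_IGNORE);
     for (int m = 0; m < nrecv; ++m) {         // unpack into ghost atom storage
       x[3 * (firstghost + m)]     = recvbuf[3 * m];
       x[3 * (firstghost + m) + 1] = recvbuf[3 * m + 1];
       x[3 * (firstghost + m) + 2] = recvbuf[3 * m + 2];
     }
   }

A reverse communication would run the same exchange in the opposite
direction with force data and sum (rather than copy) the unpacked values
into the owned-atom force array.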
These two operations are used many places within LAMMPS aside from
exchange of coordinates and forces, for example by manybody potentials
to share intermediate per-atom values, or by rigid-body integrators to
enable each atom in a body to access body properties. Here are
additional details about how these communication operations are
performed in LAMMPS:
- When exchanging data with different processors, forward and reverse
communication is done using ``MPI_Send()`` and ``MPI_Irecv()`` calls.
If a processor is "exchanging" atoms with itself, only the pack and
unpack operations are performed, e.g. to create ghost atoms across
periodic boundaries when running on a single processor.
- For forward communication of owned atom coordinates, periodic box
lengths are added and subtracted when the receiving processor is
across a periodic boundary from the sender. There is then no need to
apply a minimum image convention when calculating distances between
atom pairs when building neighbor lists or computing forces.
- The cutoff distance for exchanging ghost atoms is typically equal to
the neighbor cutoff, but it can also be chosen to be longer if needed,
e.g. half the diameter of a rigid body composed of multiple atoms or
over 3x the length of a stretched bond for dihedral interactions. It
can also exceed the periodic box size. For the regular communication
pattern (left), if the cutoff distance extends beyond a neighbor
processor's sub-domain, then multiple exchanges are performed in the
same direction. Each exchange is with the same neighbor processor,
but buffers are packed/unpacked using a different list of atoms. For
forward communication, in the first exchange a processor sends only
owned atoms. In subsequent exchanges, it sends ghost atoms received
in previous exchanges. For the irregular pattern (right) overlaps of
a processor's extended ghost-atom sub-domain with all other processors
in each dimension are detected.

View File

@ -0,0 +1,188 @@
Long-range interactions
^^^^^^^^^^^^^^^^^^^^^^^
For charged systems, LAMMPS can compute long-range Coulombic
interactions via the FFT-based particle-particle/particle-mesh (PPPM)
method implemented in :doc:`kspace style pppm and its variants
<kspace_style>`. For that, Coulombic interactions are partitioned into
short- and long-range components. The short-range portion is computed
in real space as a loop over pairs of charges within a cutoff distance,
using neighbor lists. The long-range portion is computed in reciprocal
space using a kspace style. For the PPPM implementation the simulation
cell is overlaid with a regular FFT grid in 3d. It proceeds in several stages:
a) each atom's point charge is interpolated to nearby FFT grid points,
b) a forward 3d FFT is performed,
c) a convolution operation is performed in reciprocal space,
d) one or more inverse 3d FFTs are performed, and
e) electric field values from grid points near each atom are interpolated to compute
its forces.
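To make stage a) more concrete, the sketch below shows the idea of charge
interpolation using the lowest-order (cloud-in-cell, i.e. trilinear)
weights on an orthogonal box with a global grid. This only illustrates the
principle; the actual PPPM implementation uses higher-order stencils,
distributed grid bricks, and reduced coordinates.

.. code-block:: c++

   #include <cmath>
   #include <vector>

   // Spread one point charge q at position (x,y,z) onto the 8 surrounding
   // points of a regular nx x ny x nz grid with spacing h (orthogonal box
   // with origin at 0 and periodic boundaries), using trilinear weights.
   void assign_charge(std::vector<double> &rho, int nx, int ny, int nz,
                      double h, double q, double x, double y, double z)
   {
     int ix = (int) std::floor(x / h);
     int iy = (int) std::floor(y / h);
     int iz = (int) std::floor(z / h);
     double fx = x / h - ix, fy = y / h - iy, fz = z / h - iz;
     for (int dz = 0; dz < 2; ++dz)
       for (int dy = 0; dy < 2; ++dy)
         for (int dx = 0; dx < 2; ++dx) {
           double w = (dx ? fx : 1.0 - fx) * (dy ? fy : 1.0 - fy) * (dz ? fz : 1.0 - fz);
           int idx = ((ix + dx) % nx) + nx * (((iy + dy) % ny) + ny * ((iz + dz) % nz));
           rho[idx] += w * q;   // wrap around periodically in each dimension
         }
   }

The electric field interpolation in stage e) uses the same weights in
reverse, gathering grid values back to each atom.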
For any of the spatial-decomposition partitioning schemes each processor
owns the brick-shaped portion of FFT grid points contained within its
sub-domain. The two interpolation operations use a stencil of grid
points surrounding each atom. To accommodate the stencil size, each
processor also stores a few layers of ghost grid points surrounding its
brick. Forward and reverse communication of grid point values is
performed similar to the corresponding :doc:`atom data communication
<Developer_par_comm>`. In this case, electric field values on owned
grid points are sent to neighboring processors to become ghost point
values. Likewise charge values on ghost points are sent and summed to
values on owned points.
For triclinic simulation boxes, the FFT grid planes are parallel to
the box faces, but the mapping of charge and electric field values
to/from grid points is done in reduced coordinates where the tilted
box is conceptually a unit cube, so that the stencil and FFT
operations are unchanged. However the FFT grid size required for a
given accuracy is larger for triclinic domains than it is for
orthogonal boxes.
.. _fft-parallel:
.. figure:: img/fft-decomp-parallel.png
:align: center
parallel FFT in PPPM
Stages of a parallel FFT for a simulation domain overlaid
with an 8x8x8 3d FFT grid, partitioned across 64 processors.
Within each of the 4 diagrams, grid cells of the same color are
owned by a single processor; for simplicity only cells owned by 4
or 8 of the 64 processors are colored. The two images on the left
illustrate brick-to-pencil communication. The two images on the
right illustrate pencil-to-pencil communication, which in this
case transposes the *y* and *z* dimensions of the grid.
Parallel 3d FFTs require substantial communication relative to their
computational cost. A 3d FFT is implemented by a series of 1d FFTs
along the *x*-, *y*-, and *z*-directions of the FFT grid. Thus the FFT
grid cannot be decomposed like atoms into 3 dimensions for parallel
processing of the FFTs, but only in 1 (as planes) or 2 (as pencils)
dimensions, and in between the steps the grid needs to be transposed so
that the FFT grid portion "owned" by each MPI process is complete in the
direction of the 1d FFTs it has to perform. LAMMPS uses the
pencil-decomposition algorithm as shown in the :ref:`fft-parallel` figure.
Initially (far left), each processor owns a brick of same-color grid
cells (actually grid points) contained within its sub-domain. A
brick-to-pencil communication operation converts this layout to 1d
pencils in the *x*-dimension (center left). Again, cells of the same
color are owned by the same processor. Each processor can then compute
a 1d FFT on each pencil of data it wholly owns using a call to the
configured FFT library. A pencil-to-pencil communication then converts
this layout to pencils in the *y* dimension (center right) which
effectively transposes the *x* and *y* dimensions of the grid, followed
by 1d FFTs in *y*. A final transpose of pencils from *y* to *z* (far
right) followed by 1d FFTs in *z* completes the forward FFT. The data
is left in a *z*-pencil layout for the convolution operation. One or
more inverse FFTs then perform the sequence of 1d FFTs and communication
steps in reverse order; the final layout of resulting grid values is the
same as the initial brick layout.
Each communication operation within the FFT (brick-to-pencil or
pencil-to-pencil or pencil-to-brick) converts one tiling of the 3d grid
to another, where a tiling in this context means an assignment of a
small brick-shaped subset of grid points to each processor, the union of
which comprise the entire grid. The parallel `fftMPI library
<https://lammps.github.io/fftmpi/>`_ written for LAMMPS allows arbitrary
definitions of the tiling so that an irregular partitioning of the
simulation domain can use it directly. Transforming data from one
tiling to another is implemented in `fftMPI` using point-to-point
communication, where each processor sends data to a few other
processors, since each tile in the initial tiling overlaps with a
handful of tiles in the final tiling.
The transformations could also be done using collective communication
across all *P* processors with a single call to ``MPI_Alltoall()``, but
this is typically much slower. However, for the specialized brick and
pencil tiling illustrated in the :ref:`fft-parallel` figure, collective
communication across the entire MPI communicator is not required. In
the example an :math:`8^3` grid with 512 grid cells is partitioned
across 64 processors; each processor owns a 2x2x2 3d brick of grid
cells. The initial brick-to-pencil communication (upper left to upper
right) only requires collective communication within subgroups of 4
processors, as illustrated by the 4 colors. More generally, a
brick-to-pencil communication can be performed by partitioning *P*
processors into :math:`P^{\frac{2}{3}}` subgroups of
:math:`P^{\frac{1}{3}}` processors each. Each subgroup performs
collective communication only within its subgroup. Similarly,
pencil-to-pencil communication can be performed by partitioning *P*
processors into :math:`P^{\frac{1}{2}}` subgroups of
:math:`P^{\frac{1}{2}}` processors each. This is illustrated in the
figure for the :math:`y \Rightarrow z` communication (center). An
eight-processor subgroup owns the front *yz* plane of data and performs
collective communication within the subgroup to transpose from a
*y*-pencil to *z*-pencil layout.
LAMMPS invokes point-to-point communication by default, but also
provides the option of partitioned collective communication when using a
:doc:`kspace_modify collective yes <kspace_modify>` command to switch to
that mode. In the latter case, the code detects the size of the
disjoint subgroups and partitions the single *P*-size communicator into
multiple smaller communicators, each of which invokes collective
communication. Testing on a large IBM Blue Gene/Q machine at Argonne
National Labs showed a significant improvement in FFT performance for
large processor counts; partitioned collective communication was faster
than point-to-point communication or global collective communication
involving all *P* processors.
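The sketch below illustrates only the partitioned collective idea; it is
not the fftMPI implementation. It assumes that a subgroup "color" (e.g.
the index of the pencil plane a rank belongs to) has already been computed
from the two tilings and that every rank contributes the same number of
values.

.. code-block:: c++

   #include <mpi.h>
   #include <vector>

   // Split the global communicator into disjoint subgroups and perform the
   // grid transpose as an all-to-all restricted to each subgroup, instead
   // of a collective over all P processors.
   void transpose_within_subgroup(std::vector<double> &sendbuf,
                                  std::vector<double> &recvbuf,
                                  int color, int nper, MPI_Comm world)
   {
     int me;
     MPI_Comm_rank(world, &me);
     MPI_Comm subgroup;
     MPI_Comm_split(world, color, me, &subgroup);   // ranks with equal color
     MPI_Alltoall(sendbuf.data(), nper, MPI_DOUBLE,
                  recvbuf.data(), nper, MPI_DOUBLE, subgroup);
     MPI_Comm_free(&subgroup);
   }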
Here are some additional details about FFTs for long-range and related
grid/particle operations that LAMMPS supports:
- The fftMPI library allows each grid dimension to be a multiple of
small prime factors (2,3,5), and allows any number of processors to
perform the FFT. The resulting brick and pencil decompositions are
thus not always as well-aligned but the size of subgroups of
processors for the two modes of communication (brick/pencil and
pencil/pencil) still scale as :math:`O(P^{\frac{1}{3}})` and
:math:`O(P^{\frac{1}{2}})`.
- For efficiency in performing 1d FFTs, the grid transpose
operations illustrated in the :ref:`fft-parallel` figure also involve
reordering the 3d data so that a different dimension is contiguous
in memory. This reordering can be done during the packing or
unpacking of buffers for MPI communication.
- For large systems and particularly a large number of MPI processes,
the dominant cost for parallel FFTs is often the communication, not
the computation of 1d FFTs, even though the latter scales as :math:`N
\log(N)` in the number of grid points *N* per grid direction. This is
due to the fact that only a 2d decomposition into pencils is possible
while atom data (and their corresponding short-range force and energy
computations) can be decomposed efficiently in 3d.
This can be addressed by reducing the number of MPI processes involved
in the MPI communication by using :doc:`hybrid MPI + OpenMP
parallelization <Speed_omp>`. This will use OpenMP parallelization
inside the MPI domains and while that may have a lower parallel
efficiency, it reduces the communication overhead.
As an alternative it is also possible to start a :ref:`multi-partition
<partition>` calculation and then use the :doc:`verlet/split
integrator <run_style>` to perform the PPPM computation on a
dedicated, separate partition of MPI processes. This uses an integer
"1:*p*" mapping of *p* sub-domains of the atom decomposition to one
sub-domain of the FFT grid decomposition, where pairwise non-bonded
and bonded forces and energies are computed on the larger partition
and the PPPM kspace computation runs concurrently on the smaller partition.
- LAMMPS also implements PPPM-based solvers for other long-range
interactions, dipole and dispersion (Lennard-Jones), which can be used
in conjunction with long-range Coulombics for point charges.
- LAMMPS implements a ``GridComm`` class which overlays the simulation
domain with a regular grid, partitions it across processors in a
manner consistent with processor sub-domains, and provides methods for
forward and reverse communication of owned and ghost grid point
values. It is used for PPPM as an FFT grid (as outlined above) and
also for the MSM algorithm which uses a cascade of grid sizes from
fine to coarse to compute long-range Coulombic forces. The GridComm
class is also useful for models where continuum fields interact with
particles. For example, the two-temperature model (TTM) defines heat
transfer between atoms (particles) and electrons (continuum gas) where
spatial variations in the electron temperature are computed by finite
differences of a discretized heat equation on a regular grid. The
:doc:`fix ttm/grid <fix_ttm>` command uses the ``GridComm`` class
internally to perform its grid operations on a distributed grid
instead of the original :doc:`fix ttm <fix_ttm>` which uses a
replicated grid.

View File

@ -0,0 +1,159 @@
Neighbor lists
^^^^^^^^^^^^^^
To compute forces efficiently, each processor creates a Verlet-style
neighbor list which enumerates all pairs of atoms *i,j* (*i* = owned,
*j* = owned or ghost) with separation less than the applicable
neighbor list cutoff distance. In LAMMPS the neighbor lists are stored
in a multiple-page data structure; each page is a contiguous chunk of
memory which stores vectors of neighbor atoms *j* for many *i* atoms.
This allows pages to be incrementally allocated or deallocated in blocks
as needed. Neighbor lists typically consume the most memory of any data
structure in LAMMPS. The neighbor list is rebuilt (from scratch) once
every few timesteps, then used repeatedly each step for force or other
computations. The neighbor cutoff distance is :math:`R_n = R_f +
\Delta_s`, where :math:`R_f` is the (largest) force cutoff defined by
the interatomic potential for computing short-range pairwise or manybody
forces and :math:`\Delta_s` is a "skin" distance that allows the list to
be used for multiple steps assuming that atoms do not move very far
between consecutive time steps. Typically the code triggers
reneighboring when any atom has moved half the skin distance since the
last reneighboring; this and other options of the neighbor list rebuild
can be adjusted with the :doc:`neigh_modify <neigh_modify>` command.
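A minimal sketch of this trigger criterion is shown below. It assumes
positions are stored as three doubles per atom and that ``xhold`` keeps a
copy from the last neighbor list build; the actual code also accounts for
box changes and combines the result across MPI ranks.

.. code-block:: c++

   #include <vector>

   // Return true when any atom has moved more than half the skin distance
   // since the last neighbor list build, which triggers reneighboring.
   bool need_reneighbor(const std::vector<double> &x,
                        const std::vector<double> &xhold, double skin)
   {
     const double triggersq = 0.25 * skin * skin;   // (skin/2)^2
     for (std::size_t i = 0; i + 2 < x.size(); i += 3) {
       const double dx = x[i] - xhold[i];
       const double dy = x[i + 1] - xhold[i + 1];
       const double dz = x[i + 2] - xhold[i + 2];
       if (dx * dx + dy * dy + dz * dz > triggersq) return true;
     }
     return false;
   }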
On steps when reneighboring is performed, atoms which have moved outside
their owning processor's sub-domain are first migrated to new processors
via communication. Periodic boundary conditions are also (only)
enforced on these steps to ensure each atom is re-assigned to the
correct processor. After migration, the atoms owned by each processor
are stored in a contiguous vector. Periodically each processor
spatially sorts owned atoms within its vector to reorder it for improved
cache efficiency in force computations and neighbor list building. For
that atoms are spatially binned and then reordered so that atoms in the
same bin are adjacent in the vector. Atom sorting can be disabled or
its settings modified with the :doc:`atom_modify <atom_modify>` command.
.. _neighbor-stencil:
.. figure:: img/neigh-stencil.png
:align: center
neighbor list stencils
A 2d simulation sub-domain (thick black line) and the corresponding
ghost atom cutoff region (dashed blue line) for both orthogonal
(left) and triclinic (right) domains. A regular grid of neighbor
bins (thin lines) overlays the entire simulation domain and need not
align with sub-domain boundaries; only the portion overlapping the
augmented sub-domain is shown. In the triclinic case it overlaps the
bounding box of the tilted rectangle. The blue- and red-shaded bins
represent a stencil of bins searched to find neighbors of a particular
atom (black dot).
To build a local neighbor list in linear time, the simulation domain is
overlaid (conceptually) with a regular 3d (or 2d) grid of neighbor bins,
as shown in the :ref:`neighbor-stencil` figure for 2d models and a
single MPI processor's sub-domain. Each processor stores a set of
neighbor bins which overlap its sub-domain extended by the neighbor
cutoff distance :math:`R_n`. As illustrated, the bins need not align
with processor boundaries; an integer number of bins in each dimension is
fit to the size of the entire simulation box.
Most often LAMMPS builds what it calls a "half" neighbor list where
each *i,j* neighbor pair is stored only once, with either atom *i* or
*j* as the central atom. The build can be done efficiently by using a
pre-computed "stencil" of bins around a central origin bin which
contains the atom whose neighbors are being searched for. A stencil
is simply a list of integer offsets in *x,y,z* of nearby bins
surrounding the origin bin which are close enough to contain any
neighbor atom *j* within a distance :math:`R_n` from any atom *i* in the
origin bin. Note that for a half neighbor list, the stencil can be
asymmetric, since each atom only needs to store half of its nearby neighbors.
These stencils are illustrated in the figure for a half list and a bin
size of :math:`\frac{1}{2} R_n`. There are 13 red+blue stencil bins in
2d (for the orthogonal case, 15 for triclinic). In 3d there would be
63: 13 in the plane of bins that contains the origin bin and 25 in each
of the two planes above it in the *z* direction (75 for triclinic). The
reason the triclinic stencil has extra bins is that the bins tile the
bounding box of the entire triclinic domain and thus are not periodic
with respect to the simulation box itself. The stencil and logic for
determining which *i,j* pairs to include in the neighbor list are
altered slightly to account for this.
To build a neighbor list, a processor first loops over its "owned" plus
"ghost" atoms and assigns each to a neighbor bin. This uses an integer
vector to create a linked list of atom indices within each bin. It then
performs a triply-nested loop over its owned atoms *i*, the stencil of
bins surrounding atom *i*'s bin, and the *j* atoms in each stencil bin
(including ghost atoms). If the distance :math:`r_{ij} < R_n`, then
atom *j* is added to the vector of atom *i*'s neighbors.
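The sketch below shows this bin-and-stencil search in a compact form. It
is not the LAMMPS ``Neighbor`` class code: the helpers ``bin_of()`` and
``rsq()`` stand in for what the real code derives from positions, bin
geometry, and cutoffs, and periodic wrapping of stencil bins is omitted.
For a half list, the stencil would be asymmetric and pairs within the same
bin would additionally be filtered by an ordering test.

.. code-block:: c++

   #include <algorithm>
   #include <vector>

   // Build per-atom neighbor lists using bins and a pre-computed stencil of
   // bin offsets.  Owned and ghost atoms (nall total) are first linked into
   // their bins; then, for each owned atom (nlocal), the stencil bins are
   // scanned and atoms within the neighbor cutoff are recorded.
   void build_neighbor_list(int nlocal, int nall, double cutneighsq,
                            const std::vector<int> &stencil,
                            int (*bin_of)(int), double (*rsq)(int, int),
                            std::vector<std::vector<int>> &neigh)
   {
     int nbins = 0;
     for (int i = 0; i < nall; ++i) nbins = std::max(nbins, bin_of(i) + 1);
     std::vector<int> head(nbins, -1), next(nall, -1);
     for (int i = 0; i < nall; ++i) {     // linked list of atoms per bin
       const int ib = bin_of(i);
       next[i] = head[ib];
       head[ib] = i;
     }
     neigh.assign(nlocal, {});
     for (int i = 0; i < nlocal; ++i) {
       for (int offset : stencil) {
         const int jb = bin_of(i) + offset;
         if (jb < 0 || jb >= nbins) continue;
         for (int j = head[jb]; j >= 0; j = next[j]) {
           if (j == i) continue;
           if (rsq(i, j) < cutneighsq) neigh[i].push_back(j);
         }
       }
     }
   }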
Here are additional details about neighbor list build options LAMMPS
supports:
- The choice of bin size is an option; a size half of :math:`R_n` has
been found to be optimal for many typical cases. Smaller bins incur
additional overhead to loop over; larger bins require more distance
calculations. Note that for smaller bin sizes, the 2d stencil in the
figure would be more semi-circular in shape (hemispherical in 3d),
with bins near the corners of the square eliminated due to their
distance from the origin bin.
- Depending on the interatomic potential(s) and other commands used in
an input script, multiple neighbor lists and stencils with different
attributes may be needed. This includes lists with different cutoff
distances, e.g. for force computation versus occasional diagnostic
computations such as a radial distribution function, or for the
r-RESPA time integrator which can partition pairwise forces by
distance into subsets computed at different time intervals. It
includes "full" lists (as opposed to half lists) where each *i,j* pair
appears twice, stored once with *i* and *j*, and which use a larger
symmetric stencil. It also includes lists with partial enumeration of
ghost atom neighbors. The full and ghost-atom lists are used by
various manybody interatomic potentials. Lists may also use different
criteria for inclusion of a pair interaction. Typically this simply
depends only on the distance between two atoms and the cutoff
distance. But for finite-size coarse-grained particles with
individual diameters (e.g. polydisperse granular particles), it can
also depend on the diameters of the two particles.
- When using :doc:`pair style hybrid <pair_hybrid>` multiple sub-lists
of the master neighbor list for the full system need to be generated,
one for each sub-style, which contains only the *i,j* pairs needed to
compute interactions between subsets of atoms for the corresponding
potential. This means not all *i* or *j* atoms owned by a processor
are included in a particular sub-list.
- Some models use different cutoff lengths for pairwise interactions
between different kinds of particles which are stored in a single
neighbor list. One example is a solvated colloidal system with large
colloidal particles where colloid/colloid, colloid/solvent, and
solvent/solvent interaction cutoffs can be dramatically different.
Another is a model of polydisperse finite-size granular particles;
pairs of particles interact only when they are in contact with each
other. Mixtures with particle size ratios as high as 10-100x may be
used to model realistic systems. Efficient neighbor list building
algorithms for these kinds of systems are available in LAMMPS. They
include a method which uses different stencils for different cutoff
lengths and trims the stencil to only include bins that straddle the
cutoff sphere surface. More recently a method which uses both
multiple stencils and multiple bin sizes was developed; it builds
neighbor lists efficiently for systems with particles of any size
ratio, though other considerations (timestep size, force computations)
may limit the ability to model systems with huge polydispersity.
- For small and sparse systems and as a fallback method, LAMMPS also
supports neighbor list construction without binning by using a full
:math:`O(N^2)` loop over all *i,j* atom pairs in a sub-domain when
using the :doc:`neighbor nsq <neighbor>` command.
- Dependent on the "pair" setting of the :doc:`newton <newton>` command,
the "half" neighbor lists may contain **all** pairs of atoms where
atom *j* is a ghost atom (i.e. when the newton pair setting is *off*)
For the newton pair *on* setting the atom *j* is only added to the
list if its *z* coordinate is larger, or if equal the *y* coordinate
is larger, and that is equal, too, the *x* coordinate is larger. For
homogeneously dense systems that will result in picking neighbors from
a same size sector in always the same direction relative to the
"owned" atom and thus it should lead to similar length neighbor lists
and thus reduce the chance of a load imbalance.
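Below is a small sketch of that coordinate-based tie-breaking rule; the
function name is made up for illustration and ``xi``/``xj`` point to the
(x, y, z) coordinates of owned atom *i* and ghost atom *j*.

.. code-block:: c++

   // With "newton pair on", keep an owned/ghost pair only if the ghost atom
   // j lies "above" atom i: larger z, with ties broken by y and then x.
   bool keep_ghost_pair(const double *xi, const double *xj)
   {
     if (xj[2] != xi[2]) return xj[2] > xi[2];
     if (xj[1] != xi[1]) return xj[1] > xi[1];
     return xj[0] > xi[0];
   }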

View File

@ -0,0 +1,114 @@
OpenMP Parallelism
^^^^^^^^^^^^^^^^^^
The styles in the INTEL, KOKKOS, and OPENMP packages can use OpenMP
thread parallelism, predominantly to distribute loops over local data,
and thus follow a parallelization strategy that is orthogonal to the
decomposition into spatial domains used by the :doc:`MPI partitioning
<Developer_par_part>`. For clarity, this section discusses only the
implementation in the OPENMP package, as it is the simplest. The INTEL
and KOKKOS packages offer additional options and are more complex since
they support more features and different hardware like co-processors
or GPUs.
One of the key decisions when implementing the OPENMP package was to
keep the changes to the source code small, so that it would be easier to
maintain the code and keep it in sync with the non-threaded standard
implementation. This is achieved by a) making the OPENMP version a
derived class from the regular version (e.g. ``PairLJCutOMP`` from
``PairLJCut``) and overriding only methods that are multi-threaded or
need to be modified to support multi-threading (similar to what was done
in the OPT package), b) keeping the structure in the modified code very
similar so that side-by-side comparisons are still useful, and c)
offloading additional functionality and multi-thread support functions
into three separate classes ``ThrOMP``, ``ThrData``, and ``FixOMP``.
``ThrOMP`` provides additional, multi-thread aware functionality not
available in the corresponding base class (e.g. ``Pair`` for
``PairLJCutOMP``) like multi-thread aware variants of the "tally"
functions. Those functions are made available through multiple
inheritance so those new functions have to have unique names to avoid
ambiguities; typically ``_thr`` is appended to the name of the function.
``ThrData`` is a class that manages per-thread data structures.
It is used instead of extending the corresponding storage to per-thread
arrays to avoid slowdowns due to "false sharing" when multiple threads
update adjacent elements in an array and thus force the CPU cache lines
to be reset and re-fetched. ``FixOMP`` finally manages the "multi-thread
state" like settings and access to per-thread storage; it is activated
by the :doc:`package omp <package>` command.
Avoiding data races
"""""""""""""""""""
A key problem when implementing thread parallelism in an MD code is
to avoid data races when updating accumulated properties like forces,
energies, and stresses. When interactions are computed, they always
involve multiple atoms and thus there are race conditions when multiple
threads want to update per-atom data of the same atoms. Five possible
strategies have been considered to avoid this:
1) restructure the code so that there is no overlapping access possible
when computing in parallel, e.g. by breaking lists into multiple
parts and synchronizing threads in between.
2) have each thread be "responsible" for a specific group of atoms and
compute these interactions multiple times, once on each thread that
is responsible for a given atom and then have each thread only update
the properties of this atom.
3) use mutexes around functions and regions of code where the data race
could happen.
4) use atomic operations when updating per-atom properties.
5) use replicated per-thread data structures to accumulate data without
conflicts and then use a reduction to combine those results into the
data structures used by the regular style.
Option 5 was chosen for the OPENMP package because it would retain the
performance for the case of 1 thread and the code would be more
maintainable. Option 1 would require extensive code changes,
particularly to the neighbor list code; options 2 would have incurred a
2x or more performance penalty for the serial case; option 3 causes
significant overhead and would enforce serialization of operations in
inner loops and thus defeat the purpose of multi-threading; option 4
slows down the serial case although not quite as bad as option 2. The
downside of option 5 is that the overhead of the reduction operations
grows with the number of threads used, so there would be a crossover
point where options 2 or 4 would result in faster executing. That is
why option 2 for example is used in the GPU package because a GPU is a
processor with a massive number of threads. However, since the MPI
parallelization is generally more effective for typical MD systems, the
expectation is that thread parallelism is only used for a smaller number
of threads (2-8). At the time of its implementation, that number was
equivalent to the number of CPU cores per CPU socket on high-end
supercomputers.
Thus arrays like the force array are dimensioned to the number of atoms
times the number of threads when OpenMP support is enabled, and inside
the compute functions each thread obtains a pointer to a different chunk.
Similarly, accumulators like the potential energy or virial are kept in
per-thread instances of the ``ThrData`` class and only reduced and
stored in their global counterparts at the end of the force computation.
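A stripped-down sketch of this reduction step is shown below. It assumes
the force array ``f`` holds ``nthreads`` consecutive chunks of ``3 * nall``
values, with the first chunk acting as the global force array; the real
implementation lives in the ``ThrOMP``/``ThrData`` classes and also handles
energies and virials.

.. code-block:: c++

   #include <vector>

   // Sum the per-thread force chunks into the global (first) chunk and
   // clear them for the next step.  The loop over force components is
   // itself distributed across threads with static scheduling.
   void reduce_thread_forces(std::vector<double> &f, int nall, int nthreads)
   {
     #pragma omp parallel for schedule(static)
     for (int i = 0; i < 3 * nall; ++i) {
       double sum = 0.0;
       for (int t = 1; t < nthreads; ++t) {
         sum += f[t * 3 * nall + i];
         f[t * 3 * nall + i] = 0.0;
       }
       f[i] += sum;
     }
   }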
Loop scheduling
"""""""""""""""
Multi-thread parallelization is applied by distributing (outer) loops
statically across threads. Typically this would be the loop over local
atoms *i* when processing *i,j* pairs of atoms from a neighbor list.
The design of the neighbor list code results in atoms having a similar
number of neighbors for homogeneous systems, so load imbalances
across threads are not common; they typically occur for systems where
the MPI parallelization would also be unbalanced, which usually has a
more pronounced impact on performance. The same loop
scheduling scheme can also be applied to the reduction operations on
per-atom data to reduce the overhead of the reduction operation.
Neighbor list parallelization
"""""""""""""""""""""""""""""
In addition to the parallelization of force computations, the
generation of the neighbor lists is also parallelized. As explained
previously, neighbor lists are built by looping over "owned" atoms and
storing the neighbors in "pages". In the OPENMP variants of the
neighbor list code, each thread operates on a different chunk of "owned"
atoms and allocates and fills its own set of pages with neighbor list
data. This is achieved by each thread keeping its own instance of the
:cpp:class:`MyPage <LAMMPS_NS::MyPage>` page allocator class.

View File

@ -0,0 +1,89 @@
Partitioning
^^^^^^^^^^^^
The underlying spatial decomposition strategy used by LAMMPS for
distributed-memory parallelism is set with the :doc:`comm_style command
<comm_style>` and can be either "brick" (a regular grid) or "tiled".
.. _domain-decomposition:
.. figure:: img/domain-decomp.png
:align: center
domain decomposition
This figure shows the different kinds of domain decomposition used
for MPI parallelization: "brick" on the left with an orthogonal
(left) and a triclinic (middle) simulation domain, and a "tiled"
decomposition (right). The black lines show the division into
sub-domains and the contained atoms are "owned" by the corresponding
MPI process. The green dashed lines indicate how sub-domains are
extended with "ghost" atoms up to the communication cutoff distance.
The LAMMPS simulation box is a 3d or 2d volume, which can be orthogonal
or triclinic in shape, as illustrated in the :ref:`domain-decomposition`
figure for the 2d case. Orthogonal means the box edges are aligned with
the *x*, *y*, *z* Cartesian axes, and the box faces are thus all
rectangular. Triclinic allows for a more general parallelepiped shape
in which edges are aligned with three arbitrary vectors and the box
faces are parallelograms. In each dimension box faces can be periodic,
or non-periodic with fixed or shrink-wrapped boundaries. In the fixed
case, atoms which move outside the face are deleted; shrink-wrapped
means the position of the box face adjusts continuously to enclose all
the atoms.
For distributed-memory MPI parallelism, the simulation box is spatially
decomposed (partitioned) into non-overlapping sub-domains which fill the
box. The default partitioning, "brick", is most suitable when atom
density is roughly uniform, as shown in the left-side images of the
:ref:`domain-decomposition` figure. The sub-domains comprise a regular
grid and all sub-domains are identical in size and shape. Both the
orthogonal and triclinic boxes can deform continuously during a
simulation, e.g. to compress a solid or shear a liquid, in which case
the processor sub-domains likewise deform.
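As a simple illustration of the "brick" case, the sketch below maps an
atom position to the rank of the owning sub-domain for an orthogonal box
split into a regular Px x Py x Pz processor grid. The function name and
the rank ordering are illustrative, not the actual LAMMPS mapping.

.. code-block:: c++

   // Return the rank of the processor whose sub-domain contains position x
   // for an orthogonal box with bounds [lo, hi) split into a regular
   // px x py x pz grid.  Assumes x has been remapped into the box.
   int owning_rank(const double *x, const double *lo, const double *hi,
                   int px, int py, int pz)
   {
     int ix = (int) (px * (x[0] - lo[0]) / (hi[0] - lo[0]));
     int iy = (int) (py * (x[1] - lo[1]) / (hi[1] - lo[1]));
     int iz = (int) (pz * (x[2] - lo[2]) / (hi[2] - lo[2]));
     if (ix == px) ix--;   // guard against positions exactly on the upper face
     if (iy == py) iy--;
     if (iz == pz) iz--;
     return ix + px * (iy + py * iz);
   }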
For models with non-uniform density, the number of particles per
processor can be load-imbalanced with the default partitioning. This
reduces parallel efficiency, as the overall simulation rate is limited
by the slowest processor, i.e. the one with the largest computational
load. For such models, LAMMPS supports multiple strategies to reduce
the load imbalance:
- The processor grid decomposition is by default based on the simulation
cell volume and tries to optimize the volume to surface ratio for the sub-domains.
This can be changed with the :doc:`processors command <processors>`.
- The parallel planes defining the size of the sub-domains can be shifted
with the :doc:`balance command <balance>`. This can be done in addition
to choosing a more optimal processor grid.
- The recursive bisectioning algorithm in combination with the "tiled"
communication style can produce a partitioning with equal numbers of
particles in each sub-domain.
.. |decomp1| image:: img/decomp-regular.png
:width: 24%
.. |decomp2| image:: img/decomp-processors.png
:width: 24%
.. |decomp3| image:: img/decomp-balance.png
:width: 24%
.. |decomp4| image:: img/decomp-rcb.png
:width: 24%
|decomp1| |decomp2| |decomp3| |decomp4|
The pictures above demonstrate different decompositions for a 2d system
with 12 MPI ranks. The atom colors indicate the load imbalance of each
sub-domain with green being optimal and red the least optimal.
Due to the vacuum in the system, the default decomposition is unbalanced
with several MPI ranks without atoms (left). By forcing a 1x12x1
processor grid, every MPI rank now does computations, but the number of
atoms per sub-domain is still uneven and the thin slice shape increases
the amount of communication between sub-domains (center left). With a
2x6x1 processor grid and shifted sub-domain divisions, the load
imbalance is further reduced and less communication is required
between sub-domains (center right). Using recursive
bisectioning leads to a further improved decomposition (right).

View File

@ -0,0 +1,28 @@
Parallel algorithms
-------------------
LAMMPS is designed to enable running simulations in parallel using the
MPI parallel communication standard with distributed data via domain
decomposition. The parallelization aims to be efficient and result in good
strong scaling (= good speedup for the same system) and good weak
scaling (= the computational cost of enlarging the system is
proportional to the system size). Additional parallelization using GPUs
or OpenMP can also be applied within the sub-domain assigned to an MPI
process. For clarity, most of the following illustrations show the 2d
simulation case. The underlying algorithms in those cases, however,
apply to both 2d and 3d cases equally well.
.. note::
The text and most of the figures in this chapter were adapted
for the manual from the section on parallel algorithms in the
:ref:`new LAMMPS paper <lammps_paper>`.
.. toctree::
:maxdepth: 1
Developer_par_part
Developer_par_comm
Developer_par_neigh
Developer_par_long
Developer_par_openmp

View File

@ -60,6 +60,9 @@ silently returning the result of a partial conversion or zero in cases
where the string is not a valid number. This behavior allows to more where the string is not a valid number. This behavior allows to more
easily detect typos or issues when processing input files. easily detect typos or issues when processing input files.
Similarly the :cpp:func:`logical() <LAMMPS_NS::utils::logical>` function
will convert a string into a boolean and will only accept certain words.
The *do_abort* flag should be set to ``true`` in case this function The *do_abort* flag should be set to ``true`` in case this function
is called only on a single MPI rank, as that will then trigger the is called only on a single MPI rank, as that will then trigger the
a call to ``Error::one()`` for errors instead of ``Error::all()`` a call to ``Error::one()`` for errors instead of ``Error::all()``
@ -83,6 +86,9 @@ strings for compliance without conversion.
.. doxygenfunction:: tnumeric .. doxygenfunction:: tnumeric
:project: progguide :project: progguide
.. doxygenfunction:: logical
:project: progguide
String processing String processing
^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^
@ -203,6 +209,9 @@ Convenience functions
.. doxygenfunction:: date2num .. doxygenfunction:: date2num
:project: progguide :project: progguide
.. doxygenfunction:: current_date
:project: progguide
Customized standard functions Customized standard functions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@ -40,11 +40,10 @@ We use it to show how to identify the origin of a segmentation fault.
After recompiling LAMMPS and running the input you should get something like this: After recompiling LAMMPS and running the input you should get something like this:
.. code-block: .. code-block::
$ ./lmp -in in.melt $ ./lmp -in in.melt
LAMMPS (19 Mar 2020) LAMMPS (19 Mar 2020)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:94)
using 1 OpenMP thread(s) per MPI task using 1 OpenMP thread(s) per MPI task
Lattice spacing in x,y,z = 1.6796 1.6796 1.6796 Lattice spacing in x,y,z = 1.6796 1.6796 1.6796
Created orthogonal box = (0 0 0) to (16.796 16.796 16.796) Created orthogonal box = (0 0 0) to (16.796 16.796 16.796)

View File

@ -4,28 +4,41 @@ Citing LAMMPS
Core Algorithms Core Algorithms
^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^
Since LAMMPS is a community project, there is not a single one The paper mentioned below is the best overview of LAMMPS, but there are
publication or reference that describes **all** of LAMMPS. also publications describing particular models or algorithms implemented
The canonical publication that describes the foundation, that is in LAMMPS or complementary software that it has interfaces to. Please
the basic spatial decomposition approach, the neighbor finding, see below for how to cite contributions to LAMMPS.
and basic communications algorithms used in LAMMPS is:
.. _lammps_paper:
The latest canonical publication that describes the basic features, the
source code design, the program structure, the spatial decomposition
approach, the neighbor finding, basic communications algorithms, and how
users and developers have contributed to LAMMPS is:
`LAMMPS - A flexible simulation tool for particle-based materials modeling at the atomic, meso, and continuum scales, Comp. Phys. Comm. (accepted 09/2021), DOI:10.1016/j.cpc.2021.108171 <https://doi.org/10.1016/j.cpc.2021.108171>`_
Any project using LAMMPS, or a derivative application that uses LAMMPS
as a simulation engine, should cite this paper. The paper is expected to
be published in its final form under the same DOI in the first half
of 2022. Please also give the URL of the LAMMPS website in your paper,
namely https://www.lammps.org.
The original publication describing the parallel algorithms used in the
initial versions of LAMMPS is:
`S. Plimpton, Fast Parallel Algorithms for Short-Range Molecular Dynamics, J Comp Phys, 117, 1-19 (1995). <http://www.sandia.gov/~sjplimp/papers/jcompphys95.pdf>`_ `S. Plimpton, Fast Parallel Algorithms for Short-Range Molecular Dynamics, J Comp Phys, 117, 1-19 (1995). <http://www.sandia.gov/~sjplimp/papers/jcompphys95.pdf>`_
So any project using LAMMPS (or a derivative application using LAMMPS as
a simulation engine) should cite this paper. A new publication
describing the developments and improvements of LAMMPS in the 25 years
since then is currently in preparation.
DOI for the LAMMPS code DOI for the LAMMPS code
^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^
LAMMPS developers use the `Zenodo service at CERN LAMMPS developers use the `Zenodo service at CERN <https://zenodo.org/>`_
<https://zenodo.org/>`_ to create digital object identifiers (DOI) for to create digital object identifiers (DOI) for stable releases of the
stable releases of the LAMMPS code. There are two types of DOIs for the LAMMPS source code. There are two types of DOIs for the LAMMPS source code.
LAMMPS source code: the canonical DOI for **all** versions of LAMMPS,
which will always point to the **latest** stable release version is: The canonical DOI for **all** versions of LAMMPS, which will always
point to the **latest** stable release version is:
- DOI: `10.5281/zenodo.3726416 <https://dx.doi.org/10.5281/zenodo.3726416>`_ - DOI: `10.5281/zenodo.3726416 <https://dx.doi.org/10.5281/zenodo.3726416>`_
@ -45,11 +58,13 @@ about LAMMPS and its features.
Citing contributions Citing contributions
^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^
LAMMPS has many features and that use either previously published LAMMPS has many features that use either previously published methods
methods and algorithms or novel features. It also includes potential and algorithms or novel features. It also includes potential parameter
parameter filed for specific models. Where available, a reminder about files for specific models. Where available, a reminder about references
references for optional features used in a specific run is printed to for optional features used in a specific run is printed to the screen
the screen and log file. Style and output location can be selected with and log file. Style and output location can be selected with the
the :ref:`-cite command-line switch <cite>`. Additional references are :ref:`-cite command-line switch <cite>`. Additional references are
given in the documentation of the :doc:`corresponding commands given in the documentation of the :doc:`corresponding commands
<Commands_all>` or in the :doc:`Howto tutorials <Howto>`. <Commands_all>` or in the :doc:`Howto tutorials <Howto>`. So please
make certain that you provide the proper acknowledgments and citations
in any published works using LAMMPS.

View File

@ -26,7 +26,7 @@ available online are listed below.
* `Tutorials <https://www.lammps.org/tutorials.html>`_ * `Tutorials <https://www.lammps.org/tutorials.html>`_
* `Pre- and post-processing tools for LAMMPS <https://www.lammps.org/prepost.html>`_ * `Pre- and post-processing tools for LAMMPS <https://www.lammps.org/prepost.html>`_
* `Other software usable with LAMMPS <https://www.lammps.org/offsite.html>`_ * `Other software usable with LAMMPS <https://www.lammps.org/external.html>`_
* `Viz tools usable with LAMMPS <https://www.lammps.org/viz.html>`_ * `Viz tools usable with LAMMPS <https://www.lammps.org/viz.html>`_
* `Benchmark performance <https://www.lammps.org/bench.html>`_ * `Benchmark performance <https://www.lammps.org/bench.html>`_

View File

@ -34,7 +34,7 @@ simple example demonstrating its use:
int lmpargc = sizeof(lmpargv)/sizeof(const char *); int lmpargc = sizeof(lmpargv)/sizeof(const char *);
/* create LAMMPS instance */ /* create LAMMPS instance */
handle = lammps_open_no_mpi(lmpargc, lmpargv, NULL); handle = lammps_open_no_mpi(lmpargc, (char **)lmpargv, NULL);
if (handle == NULL) { if (handle == NULL) {
printf("LAMMPS initialization failed"); printf("LAMMPS initialization failed");
lammps_mpi_finalize(); lammps_mpi_finalize();

View File

@ -115,8 +115,8 @@ External contributions
If you prefer to do so, you can also develop and support your add-on If you prefer to do so, you can also develop and support your add-on
feature **without** having it included in the LAMMPS distribution, for feature **without** having it included in the LAMMPS distribution, for
example as a download from a website of your own. See the `Offsite example as a download from a website of your own. See the `External
LAMMPS packages and tools <https://www.lammps.org/offsite.html>`_ page LAMMPS packages and tools <https://www.lammps.org/external.html>`_ page
of the LAMMPS website for examples of groups that do this. We are happy of the LAMMPS website for examples of groups that do this. We are happy
to advertise your package and website from that page. Simply email the to advertise your package and website from that page. Simply email the
`developers <https://www.lammps.org/authors.html>`_ with info about your `developers <https://www.lammps.org/authors.html>`_ with info about your

View File

@ -305,19 +305,22 @@ you are uncertain, please ask.
FILE pointers and only be done on MPI rank 0. Use the :cpp:func:`utils::logmesg` FILE pointers and only be done on MPI rank 0. Use the :cpp:func:`utils::logmesg`
convenience function where possible. convenience function where possible.
- header files should only include the absolute minimum number of - Header files, especially those defining a "style", should only use
include files and **must not** contain any ``using`` statements; the absolute minimum number of include files and **must not** contain
rather the include statements should be put into the corresponding any ``using`` statements. Typically that would be only the header for
implementation files. For implementation files, the the base class. Instead any include statements should be put into the
"include-what-you-use" principle should be employed. However, when corresponding implementation files and forward declarations be used.
including the ``pointers.h`` header (or one of the base classes For implementation files, the "include what you use" principle should
derived from it) certain headers will be included and thus need to be be employed. However, there is the notable exception that when the
specified. These are: `mpi.h`, `cstddef`, `cstdio`, `cstdlib`, ``pointers.h`` header is included (or one of the base classes derived
`string`, `utils.h`, `fmt/format.h`, `climits`, `cinttypes`. This also from it) certain headers will always be included and thus do not need
means any header can assume that `FILE`, `NULL`, and `INT_MAX` are to be explicitly specified.
defined. These are: `mpi.h`, `cstddef`, `cstdio`, `cstdlib`, `string`, `utils.h`,
`vector`, `fmt/format.h`, `climits`, `cinttypes`.
This also means any such file can assume that `FILE`, `NULL`, and
`INT_MAX` are defined.
- header files that define a new LAMMPS style (i.e. that have a - Header files that define a new LAMMPS style (i.e. that have a
``SomeStyle(some/name,SomeName);`` macro in them) should only use the ``SomeStyle(some/name,SomeName);`` macro in them) should only use the
include file for the base class and otherwise use forward declarations include file for the base class and otherwise use forward declarations
and pointers; when interfacing to a library use the PIMPL (pointer and pointers; when interfacing to a library use the PIMPL (pointer
@ -325,7 +328,7 @@ you are uncertain, please ask.
that contains all library specific data (and thus requires the library that contains all library specific data (and thus requires the library
header) but use a forward declaration and define the struct only in header) but use a forward declaration and define the struct only in
the implementation file. This is a **strict** requirement since this the implementation file. This is a **strict** requirement since this
is where type clashes between packages and hard to fine bugs have is where type clashes between packages and hard to find bugs have
regularly manifested in the past. regularly manifested in the past.
- Please use clang-format only to reformat files that you have - Please use clang-format only to reformat files that you have

View File

@ -2,17 +2,25 @@ Basics of running LAMMPS
======================== ========================
LAMMPS is run from the command line, reading commands from a file via LAMMPS is run from the command line, reading commands from a file via
the -in command line flag, or from standard input. the -in command line flag, or from standard input. Using the "-in
Using the "-in in.file" variant is recommended: in.file" variant is recommended (see note below). The name of the
LAMMPS executable is either ``lmp`` or ``lmp_<machine>`` with
`<machine>` being the machine string used when compiling LAMMPS. The
suffix is required when compiling LAMMPS with the traditional build system
(e.g. with ``make mpi``), but optional when using CMake to configure and
build LAMMPS:
.. code-block:: bash .. code-block:: bash
$ lmp_serial -in in.file $ lmp_serial -in in.file
$ lmp_serial < in.file $ lmp_serial < in.file
$ lmp -in in.file
$ lmp < in.file
$ /path/to/lammps/src/lmp_serial -i in.file $ /path/to/lammps/src/lmp_serial -i in.file
$ mpirun -np 4 lmp_mpi -in in.file $ mpirun -np 4 lmp_mpi -in in.file
$ mpiexec -np 4 lmp -in in.file
$ mpirun -np 8 /path/to/lammps/src/lmp_mpi -in in.file $ mpirun -np 8 /path/to/lammps/src/lmp_mpi -in in.file
$ mpirun -np 6 /usr/local/bin/lmp -in in.file $ mpiexec -n 6 /usr/local/bin/lmp -in in.file
You normally run the LAMMPS command in the directory where your input You normally run the LAMMPS command in the directory where your input
script is located. That is also where output files are produced by script is located. That is also where output files are produced by
@ -23,7 +31,7 @@ executable itself can be placed elsewhere.
.. note:: .. note::
The redirection operator "<" will not always work when running The redirection operator "<" will not always work when running
in parallel with mpirun; for those systems the -in form is required. in parallel with mpirun or mpiexec; for those systems the -in form is required.
As LAMMPS runs it prints info to the screen and a logfile named As LAMMPS runs it prints info to the screen and a logfile named
*log.lammps*\ . More info about output is given on the *log.lammps*\ . More info about output is given on the

View File

@ -7,7 +7,7 @@ steps are often necessary to setup and analyze a simulation. A list
of such tools can be found on the `LAMMPS webpage <lws_>`_ at these links: of such tools can be found on the `LAMMPS webpage <lws_>`_ at these links:
* `Pre/Post processing <https://www.lammps.org/prepost.html>`_ * `Pre/Post processing <https://www.lammps.org/prepost.html>`_
* `Offsite LAMMPS packages & tools <https://www.lammps.org/offsite.html>`_ * `External LAMMPS packages & tools <https://www.lammps.org/external.html>`_
* `Pizza.py toolkit <pizza_>`_ * `Pizza.py toolkit <pizza_>`_
The last link for `Pizza.py <pizza_>`_ is a Python-based tool developed at The last link for `Pizza.py <pizza_>`_ is a Python-based tool developed at

View File

@ -8,9 +8,8 @@ fix brownian command
fix brownian/sphere command fix brownian/sphere command
=========================== ===========================
fix brownian/sphere command fix brownian/asphere command
=========================== ============================
Syntax Syntax
"""""" """"""

View File

@ -38,7 +38,7 @@ Syntax
*intersect* args = two or more group IDs *intersect* args = two or more group IDs
*dynamic* args = parent-ID keyword value ... *dynamic* args = parent-ID keyword value ...
one or more keyword/value pairs may be appended one or more keyword/value pairs may be appended
keyword = *region* or *var* or *every* keyword = *region* or *var* or *property* or *every*
*region* value = region-ID *region* value = region-ID
*var* value = name of variable *var* value = name of variable
*property* value = name of custom integer or floating point vector *property* value = name of custom integer or floating point vector
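
For example, a dynamic group whose membership is recomputed from a
region every few steps can be defined with a fragment like the
following sketch (the group ID, region, and values are arbitrary):

.. code-block:: LAMMPS

   # atoms currently above z = 5.0 are re-assigned to the group every 10 steps
   region   upper block INF INF INF INF 5.0 INF
   group    topatoms dynamic all region upper every 10
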

(Binary image files not shown: several new figures added under doc/src/img/, including decomp-rcb.png and ghost-comm.png.)
View File

@ -27,7 +27,7 @@ Syntax
on = set Newton pairwise flag on (currently not allowed) on = set Newton pairwise flag on (currently not allowed)
*pair/only* = *off* or *on* *pair/only* = *off* or *on*
off = apply "gpu" suffix to all available styles in the GPU package (default) off = apply "gpu" suffix to all available styles in the GPU package (default)
on - apply "gpu" suffix only pair styles on = apply "gpu" suffix only to pair styles
*binsize* value = size *binsize* value = size
size = bin size for neighbor list construction (distance units) size = bin size for neighbor list construction (distance units)
*split* = fraction *split* = fraction

View File

@ -198,8 +198,8 @@ same:
Coefficients must be defined for each pair of atoms types via the Coefficients must be defined for each pair of atoms types via the
:doc:`pair_coeff <pair_coeff>` command as described above, or in the :doc:`pair_coeff <pair_coeff>` command as described above, or in the
data file read by the :doc:`read_data <read_data>` commands, or by "Pair Coeffs" or "PairIJ Coeffs" section of the data file read by the
mixing as described below. :doc:`read_data <read_data>` command, or by mixing as described below.
For all of the *hybrid*, *hybrid/overlay*, and *hybrid/scaled* styles, For all of the *hybrid*, *hybrid/overlay*, and *hybrid/scaled* styles,
every atom type pair I,J (where I <= J) must be assigned to at least one every atom type pair I,J (where I <= J) must be assigned to at least one
@ -208,14 +208,21 @@ examples above, or in the data file read by the :doc:`read_data
<read_data>`, or by mixing as described below. Also all sub-styles <read_data>`, or by mixing as described below. Also all sub-styles
must be used at least once in a :doc:`pair_coeff <pair_coeff>` command. must be used at least once in a :doc:`pair_coeff <pair_coeff>` command.
.. note::
LAMMPS never performs mixing of parameters from different sub-styles,
**even** if they use the same type of coefficients, e.g. both contain
a Lennard-Jones potential variant. Those parameters must be provided
explicitly, as in the sketch below.
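
For example, when two LJ-based sub-styles are combined, the I,J cross
interaction is not generated by mixing their coefficients; it has to be
assigned explicitly to one of the sub-styles (the styles and
coefficients below are arbitrary example values):

.. code-block:: LAMMPS

   pair_style   hybrid lj/cut 10.0 lj/expand 10.0
   pair_coeff   1 1 lj/cut 0.20 3.0
   pair_coeff   2 2 lj/expand 0.15 3.2 0.5
   pair_coeff   1 2 lj/cut 0.17 3.1    # never mixed across sub-styles
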
If you want there to be no interactions between a particular pair of If you want there to be no interactions between a particular pair of
atom types, you have 3 choices. You can assign the type pair to some atom types, you have 3 choices. You can assign the pair of atom types
sub-style and use the :doc:`neigh_modify exclude type <neigh_modify>` to some sub-style and use the :doc:`neigh_modify exclude type <neigh_modify>`
command. You can assign it to some sub-style and set the coefficients command. You can assign it to some sub-style and set the coefficients
so that there is effectively no interaction (e.g. epsilon = 0.0 in a LJ so that there is effectively no interaction (e.g. epsilon = 0.0 in a LJ
potential). Or, for *hybrid*, *hybrid/overlay*, or *hybrid/scaled* potential). Or, for *hybrid*, *hybrid/overlay*, or *hybrid/scaled*
simulations, you can use this form of the pair_coeff command in your simulations, you can use this form of the pair_coeff command in your
input script: input script or the "PairIJ Coeffs" section of your data file:
.. code-block:: LAMMPS .. code-block:: LAMMPS
@ -238,19 +245,20 @@ styles with different requirements.
---------- ----------
Different force fields (e.g. CHARMM vs AMBER) may have different rules Different force fields (e.g. CHARMM vs. AMBER) may have different rules
for applying weightings that change the strength of pairwise for applying exclusions or weights that change the strength of pairwise
interactions between pairs of atoms that are also 1-2, 1-3, and 1-4 non-bonded interactions between pairs of atoms that are also 1-2, 1-3,
neighbors in the molecular bond topology, as normally set by the and 1-4 neighbors in the molecular bond topology. This is normally a
:doc:`special_bonds <special_bonds>` command. Different weights can be global setting defined by the :doc:`special_bonds <special_bonds>` command.
assigned to different pair hybrid sub-styles via the :doc:`pair_modify However, different weights can be assigned to different hybrid
special <pair_modify>` command. This allows multiple force fields to be sub-styles via the :doc:`pair_modify special <pair_modify>` command.
used in a model of a hybrid system, however, there is no consistent This allows multiple force fields to be used in a model of a hybrid
approach to determine parameters automatically for the interactions system, however, there is no consistent approach to determine parameters
between the two force fields, this is only recommended when particles automatically for the interactions **between** atoms of the two force
fields, thus this approach this is only recommended when particles
described by the different force fields do not mix. described by the different force fields do not mix.
Here is an example for mixing CHARMM and AMBER: The global *amber* Here is an example for combining CHARMM and AMBER: The global *amber*
setting sets the 1-4 interactions to non-zero scaling factors and setting sets the 1-4 interactions to non-zero scaling factors and
then overrides them with 0.0 only for CHARMM: then overrides them with 0.0 only for CHARMM:
@ -260,7 +268,7 @@ then overrides them with 0.0 only for CHARMM:
pair_style hybrid lj/charmm/coul/long 8.0 10.0 lj/cut/coul/long 10.0 pair_style hybrid lj/charmm/coul/long 8.0 10.0 lj/cut/coul/long 10.0
pair_modify pair lj/charmm/coul/long special lj/coul 0.0 0.0 0.0 pair_modify pair lj/charmm/coul/long special lj/coul 0.0 0.0 0.0
The this input achieves the same effect: This input achieves the same effect:
.. code-block:: LAMMPS .. code-block:: LAMMPS
@ -270,9 +278,9 @@ The this input achieves the same effect:
pair_modify pair lj/cut/coul/long special coul 0.0 0.0 0.83333333 pair_modify pair lj/cut/coul/long special coul 0.0 0.0 0.83333333
pair_modify pair lj/charmm/coul/long special lj/coul 0.0 0.0 0.0 pair_modify pair lj/charmm/coul/long special lj/coul 0.0 0.0 0.0
Here is an example for mixing Tersoff with OPLS/AA based on Here is an example for combining Tersoff with OPLS/AA based on
a data file that defines bonds for all atoms where for the a data file that defines bonds for all atoms where - for the
Tersoff part of the system the force constants for the bonded Tersoff part of the system - the force constants for the bonded
interactions have been set to 0. Note the global settings are interactions have been set to 0. Note the global settings are
effectively *lj/coul 0.0 0.0 0.5* as required for OPLS/AA: effectively *lj/coul 0.0 0.0 0.5* as required for OPLS/AA:

View File

@ -619,7 +619,7 @@ of analysis.
* - bond * - bond
- atom-ID molecule-ID atom-type x y z - atom-ID molecule-ID atom-type x y z
* - charge * - charge
- atom-type q x y z - atom-ID atom-type q x y z
* - dipole * - dipole
- atom-ID atom-type q x y z mux muy muz - atom-ID atom-type q x y z mux muy muz
* - dpd * - dpd

View File

@ -1,7 +1,7 @@
Sphinx==4.0.3 Sphinx==4.0.3
sphinxcontrib-spelling sphinxcontrib-spelling==7.2.1
git+git://github.com/akohlmey/sphinx-fortran@parallel-read git+git://github.com/akohlmey/sphinx-fortran@parallel-read
sphinx_tabs sphinx_tabs==3.2.0
breathe breathe==4.31.0
Pygments Pygments==2.10.0
six six==1.16.0

View File

@ -418,6 +418,7 @@ html_context['current_version'] = os.environ.get('LAMMPS_WEBSITE_BUILD_VERSION',
html_context['git_commit'] = git_commit html_context['git_commit'] = git_commit
html_context['versions'] = [ html_context['versions'] = [
('latest', 'https://docs.lammps.org/latest/'), ('latest', 'https://docs.lammps.org/latest/'),
('stable', 'https://docs.lammps.org/stable/'),
(version, 'https://docs.lammps.org/') (version, 'https://docs.lammps.org/')
] ]
html_context['downloads'] = [('PDF', 'Manual.pdf')] html_context['downloads'] = [('PDF', 'Manual.pdf')]

View File

@ -2265,6 +2265,7 @@ Nmols
nn nn
nnodes nnodes
Nocedal Nocedal
nO
nocite nocite
nocoeff nocoeff
nodeless nodeless
@ -2443,6 +2444,7 @@ packings
padua padua
Padua Padua
pafi pafi
PairIJ
palegoldenrod palegoldenrod
palegreen palegreen
paleturquoise paleturquoise
@ -3662,6 +3664,7 @@ Yc
ycm ycm
Yeh Yeh
yellowgreen yellowgreen
yEs
Yethiraj Yethiraj
yflag yflag
yhi yhi

View File

@ -1,3 +1,9 @@
IMPORTANT NOTE: This example has not been updated since 2014,
so it is not likely to work anymore out of the box. There have
been changes to LAMMPS and its library interface that would need
to be applied. Please see the manual for the documentation of
the library interface.
This directory has an application that runs classical MD via LAMMPS, This directory has an application that runs classical MD via LAMMPS,
but uses quantum forces calculated by the Quest DFT (density but uses quantum forces calculated by the Quest DFT (density
functional) code in place of the usual classical MD forces calculated functional) code in place of the usual classical MD forces calculated

View File

@ -1,3 +1,9 @@
IMPORTANT NOTE: This example has not been updated since 2013,
so it is not likely to work anymore out of the box. There have
been changes to LAMMPS and its library interface that would need
to be applied. Please see the manual for the documentation of
the library interface.
This directory has an application that models grain growth in the This directory has an application that models grain growth in the
presence of strain. presence of strain.

View File

@ -28,13 +28,9 @@
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
#include "lammps.h" // these are LAMMPS include files #define LAMMPS_LIB_MPI // to make lammps_open() visible
#include "input.h"
#include "atom.h"
#include "library.h" #include "library.h"
using namespace LAMMPS_NS;
int main(int narg, char **arg) int main(int narg, char **arg)
{ {
// setup MPI and various communicators // setup MPI and various communicators
@ -74,7 +70,7 @@ int main(int narg, char **arg)
char str1[32],str2[32],str3[32]; char str1[32],str2[32],str3[32];
char **lmparg = new char*[8]; char **lmparg = new char*[8];
lmparg[0] = NULL; // required placeholder for program name lmparg[0] = (char *) "LAMMPS"; // required placeholder for program name
lmparg[1] = (char *) "-screen"; lmparg[1] = (char *) "-screen";
sprintf(str1,"screen.%d",instance); sprintf(str1,"screen.%d",instance);
lmparg[2] = str1; lmparg[2] = str1;
@ -86,13 +82,9 @@ int main(int narg, char **arg)
sprintf(str3,"%g",temperature + instance*tdelta); sprintf(str3,"%g",temperature + instance*tdelta);
lmparg[7] = str3; lmparg[7] = str3;
// open N instances of LAMMPS // create N instances of LAMMPS
// either of these methods will work
LAMMPS *lmp = new LAMMPS(8,lmparg,comm_lammps); void *lmp = lammps_open(8,lmparg,comm_lammps,NULL);
//LAMMPS *lmp;
//lammps_open(8,lmparg,comm_lammps,(void **) &lmp);
delete [] lmparg; delete [] lmparg;
@ -103,7 +95,7 @@ int main(int narg, char **arg)
// query final temperature and print result for each instance // query final temperature and print result for each instance
double *ptr = (double *) double *ptr = (double *)
lammps_extract_compute(lmp,(char *) "thermo_temp",0,0); lammps_extract_compute(lmp,"thermo_temp",LMP_STYLE_GLOBAL,LMP_TYPE_SCALAR);
double finaltemp = *ptr; double finaltemp = *ptr;
double *temps = new double[ninstance]; double *temps = new double[ninstance];
@ -125,7 +117,7 @@ int main(int narg, char **arg)
// delete LAMMPS instances // delete LAMMPS instances
delete lmp; lammps_close(lmp);
// close down MPI // close down MPI

View File

@ -13,7 +13,7 @@ like below.
mpicc -c -O -Wall -g -I$HOME/lammps/src liblammpsplugin.c mpicc -c -O -Wall -g -I$HOME/lammps/src liblammpsplugin.c
mpicc -c -O -Wall -g simple.c mpicc -c -O -Wall -g simple.c
mpicc simple.o liblammsplugin.o -ldl -o simpleC mpicc simple.o liblammpsplugin.o -ldl -o simpleC
You also need to build LAMMPS as a shared library You also need to build LAMMPS as a shared library
(see examples/COUPLE/README), e.g. (see examples/COUPLE/README), e.g.

View File

@ -38,44 +38,98 @@ liblammpsplugin_t *liblammpsplugin_load(const char *lib)
#define ADDSYM(symbol) lmp->symbol = dlsym(handle,"lammps_" #symbol) #define ADDSYM(symbol) lmp->symbol = dlsym(handle,"lammps_" #symbol)
ADDSYM(open); ADDSYM(open);
ADDSYM(open_no_mpi); ADDSYM(open_no_mpi);
ADDSYM(open_fortran);
ADDSYM(close); ADDSYM(close);
ADDSYM(version);
ADDSYM(mpi_init);
ADDSYM(mpi_finalize);
ADDSYM(kokkos_finalize);
ADDSYM(python_finalize);
ADDSYM(file); ADDSYM(file);
ADDSYM(command); ADDSYM(command);
ADDSYM(commands_list); ADDSYM(commands_list);
ADDSYM(commands_string); ADDSYM(commands_string);
ADDSYM(free);
ADDSYM(extract_setting); ADDSYM(get_natoms);
ADDSYM(extract_global); ADDSYM(get_thermo);
ADDSYM(extract_box); ADDSYM(extract_box);
ADDSYM(reset_box);
ADDSYM(memory_usage);
ADDSYM(get_mpi_comm);
ADDSYM(extract_setting);
ADDSYM(extract_global_datatype);
ADDSYM(extract_global);
ADDSYM(extract_atom_datatype);
ADDSYM(extract_atom); ADDSYM(extract_atom);
ADDSYM(extract_compute); ADDSYM(extract_compute);
ADDSYM(extract_fix); ADDSYM(extract_fix);
ADDSYM(extract_variable); ADDSYM(extract_variable);
ADDSYM(get_thermo);
ADDSYM(get_natoms);
ADDSYM(set_variable); ADDSYM(set_variable);
ADDSYM(reset_box);
ADDSYM(gather_atoms); ADDSYM(gather_atoms);
ADDSYM(gather_atoms_concat); ADDSYM(gather_atoms_concat);
ADDSYM(gather_atoms_subset); ADDSYM(gather_atoms_subset);
ADDSYM(scatter_atoms); ADDSYM(scatter_atoms);
ADDSYM(scatter_atoms_subset); ADDSYM(scatter_atoms_subset);
ADDSYM(gather_bonds);
ADDSYM(set_fix_external_callback); ADDSYM(create_atoms);
ADDSYM(config_has_package); ADDSYM(find_pair_neighlist);
ADDSYM(config_package_count); ADDSYM(find_fix_neighlist);
ADDSYM(config_package_name); ADDSYM(find_compute_neighlist);
ADDSYM(neighlist_num_elements);
ADDSYM(neighlist_element_neighbors);
ADDSYM(version);
ADDSYM(get_os_info);
ADDSYM(config_has_mpi_support);
ADDSYM(config_has_gzip_support); ADDSYM(config_has_gzip_support);
ADDSYM(config_has_png_support); ADDSYM(config_has_png_support);
ADDSYM(config_has_jpeg_support); ADDSYM(config_has_jpeg_support);
ADDSYM(config_has_ffmpeg_support); ADDSYM(config_has_ffmpeg_support);
ADDSYM(config_has_exceptions); ADDSYM(config_has_exceptions);
ADDSYM(create_atoms);
ADDSYM(config_has_package);
ADDSYM(config_package_count);
ADDSYM(config_package_name);
ADDSYM(config_accelerator);
ADDSYM(has_gpu_device);
ADDSYM(get_gpu_device_info);
ADDSYM(has_style);
ADDSYM(style_count);
ADDSYM(style_name);
ADDSYM(has_id);
ADDSYM(id_count);
ADDSYM(id_name);
ADDSYM(plugin_count);
ADDSYM(plugin_name);
ADDSYM(set_fix_external_callback);
ADDSYM(fix_external_get_force);
ADDSYM(fix_external_set_energy_global);
ADDSYM(fix_external_set_energy_peratom);
ADDSYM(fix_external_set_virial_global);
ADDSYM(fix_external_set_virial_peratom);
ADDSYM(fix_external_set_vector_length);
ADDSYM(fix_external_set_vector);
ADDSYM(free);
ADDSYM(is_running);
ADDSYM(force_timeout);
#ifdef LAMMPS_EXCEPTIONS #ifdef LAMMPS_EXCEPTIONS
lmp->has_exceptions = 1; lmp->has_exceptions = 1;
ADDSYM(has_error); ADDSYM(has_error);

View File

@ -39,75 +39,121 @@ extern "C" {
#if defined(LAMMPS_BIGBIG) #if defined(LAMMPS_BIGBIG)
typedef void (*FixExternalFnPtr)(void *, int64_t, int, int64_t *, double **, double **); typedef void (*FixExternalFnPtr)(void *, int64_t, int, int64_t *, double **, double **);
#elif defined(LAMMPS_SMALLBIG) #elif defined(LAMMPS_SMALLSMALL)
typedef void (*FixExternalFnPtr)(void *, int64_t, int, int *, double **, double **);
#else
typedef void (*FixExternalFnPtr)(void *, int, int, int *, double **, double **); typedef void (*FixExternalFnPtr)(void *, int, int, int *, double **, double **);
#else
typedef void (*FixExternalFnPtr)(void *, int64_t, int, int *, double **, double **);
#endif #endif
struct _liblammpsplugin { struct _liblammpsplugin {
int abiversion; int abiversion;
int has_exceptions; int has_exceptions;
void *handle; void *handle;
void (*open)(int, char **, MPI_Comm, void **); void *(*open)(int, char **, MPI_Comm, void **);
void (*open_no_mpi)(int, char **, void **); void *(*open_no_mpi)(int, char **, void **);
void *(*open_fortran)(int, char **, void **, int);
void (*close)(void *); void (*close)(void *);
int (*version)(void *);
void (*mpi_init)();
void (*mpi_finalize)();
void (*kokkos_finalize)();
void (*python_finalize)();
void (*file)(void *, char *); void (*file)(void *, char *);
char *(*command)(void *, char *); char *(*command)(void *, const char *);
void (*commands_list)(void *, int, char **); void (*commands_list)(void *, int, const char **);
void (*commands_string)(void *, char *); void (*commands_string)(void *, const char *);
void (*free)(void *);
int (*extract_setting)(void *, char *); double (*get_natoms)(void *);
void *(*extract_global)(void *, char *); double (*get_thermo)(void *, char *);
void (*extract_box)(void *, double *, double *, void (*extract_box)(void *, double *, double *,
double *, double *, double *, int *, int *); double *, double *, double *, int *, int *);
void *(*extract_atom)(void *, char *);
void *(*extract_compute)(void *, char *, int, int);
void *(*extract_fix)(void *, char *, int, int, int, int);
void *(*extract_variable)(void *, char *, char *);
double (*get_thermo)(void *, char *);
int (*get_natoms)(void *);
int (*set_variable)(void *, char *, char *);
void (*reset_box)(void *, double *, double *, double, double, double); void (*reset_box)(void *, double *, double *, double, double, double);
void (*memory_usage)(void *, double *);
int (*get_mpi_comm)(void *);
int (*extract_setting)(void *, const char *);
int *(*extract_global_datatype)(void *, const char *);
void *(*extract_global)(void *, const char *);
void *(*extract_atom_datatype)(void *, const char *);
void *(*extract_atom)(void *, const char *);
void *(*extract_compute)(void *, const char *, int, int);
void *(*extract_fix)(void *, const char *, int, int, int, int);
void *(*extract_variable)(void *, const char *, char *);
int (*set_variable)(void *, char *, char *);
void (*gather_atoms)(void *, char *, int, int, void *); void (*gather_atoms)(void *, char *, int, int, void *);
void (*gather_atoms_concat)(void *, char *, int, int, void *); void (*gather_atoms_concat)(void *, char *, int, int, void *);
void (*gather_atoms_subset)(void *, char *, int, int, int, int *, void *); void (*gather_atoms_subset)(void *, char *, int, int, int, int *, void *);
void (*scatter_atoms)(void *, char *, int, int, void *); void (*scatter_atoms)(void *, char *, int, int, void *);
void (*scatter_atoms_subset)(void *, char *, int, int, int, int *, void *); void (*scatter_atoms_subset)(void *, char *, int, int, int, int *, void *);
void (*set_fix_external_callback)(void *, char *, FixExternalFnPtr, void*); void (*gather_bonds)(void *, void *);
int (*config_has_package)(char * package_name); // lammps_create_atoms() takes tagint and imageint as args
int (*config_package_count)(); // ifdef insures they are compatible with rest of LAMMPS
int (*config_package_name)(int index, char * buffer, int max_size); // caller must match to how LAMMPS library is built
#ifndef LAMMPS_BIGBIG
void (*create_atoms)(void *, int, int *, int *, double *,
double *, int *, int);
#else
void (*create_atoms)(void *, int, int64_t *, int *, double *,
double *, int64_t *, int);
#endif
int (*find_pair_neighlist)(void *, const char *, int, int, int);
int (*find_fix_neighlist)(void *, const char *, int);
int (*find_compute_neighlist)(void *, char *, int);
int (*neighlist_num_elements)(void *, int);
void (*neighlist_element_neighbors)(void *, int, int, int *, int *, int **);
int (*version)(void *);
void (*get_os_info)(char *, int);
int (*config_has_mpi_support)();
int (*config_has_gzip_support)(); int (*config_has_gzip_support)();
int (*config_has_png_support)(); int (*config_has_png_support)();
int (*config_has_jpeg_support)(); int (*config_has_jpeg_support)();
int (*config_has_ffmpeg_support)(); int (*config_has_ffmpeg_support)();
int (*config_has_exceptions)(); int (*config_has_exceptions)();
int (*find_pair_neighlist)(void* ptr, char * style, int exact, int nsub, int request); int (*config_has_package)(const char *);
int (*find_fix_neighlist)(void* ptr, char * id, int request); int (*config_package_count)();
int (*find_compute_neighlist)(void* ptr, char * id, int request); int (*config_package_name)(int, char *, int);
int (*neighlist_num_elements)(void* ptr, int idx);
void (*neighlist_element_neighbors)(void * ptr, int idx, int element, int * iatom, int * numneigh, int ** neighbors);
// lammps_create_atoms() takes tagint and imageint as args int (*config_accelerator)(const char *, const char *, const char *);
// ifdef insures they are compatible with rest of LAMMPS int (*has_gpu_device)();
// caller must match to how LAMMPS library is built void (*get_gpu_device_info)(char *, int);
#ifdef LAMMPS_BIGBIG int (*has_style)(void *, const char *, const char *);
void (*create_atoms)(void *, int, int64_t *, int *, int (*style_count)(void *, const char *);
double *, double *, int64_t *, int); int (*style_name)(void *, const char *, int, char *, int);
#else
void (*create_atoms)(void *, int, int *, int *, int (*has_id)(void *, const char *, const char *);
double *, double *, int *, int); int (*id_count)(void *, const char *);
#endif int (*id_name)(void *, const char *, int, char *, int);
int (*plugin_count)();
int (*plugin_name)(int, char *, char *, int);
void (*set_fix_external_callback)(void *, const char *, FixExternalFnPtr, void*);
void (*fix_external_get_force)(void *, const char *);
void (*fix_external_set_energy_global)(void *, const char *, double);
void (*fix_external_set_energy_peratom)(void *, const char *, double *);
void (*fix_external_set_virial_global)(void *, const char *, double *);
void (*fix_external_set_virial_peratom)(void *, const char *, double **);
void (*fix_external_set_vector_length)(void *, const char *, int);
void (*fix_external_set_vector)(void *, const char *, int, double);
void (*free)(void *);
void (*is_running)(void *);
void (*force_timeout)(void *);
int (*has_error)(void *); int (*has_error)(void *);
int (*get_last_error_message)(void *, char *, int); int (*get_last_error_message)(void *, char *, int);

View File

@ -1,9 +1,12 @@
LAMMPS (18 Feb 2020) LAMMPS (31 Aug 2021)
Lattice spacing in x,y,z = 1.6796 1.6796 1.6796 OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
Created orthogonal box = (0 0 0) to (6.71838 6.71838 6.71838) using 1 OpenMP thread(s) per MPI task
Lattice spacing in x,y,z = 1.6795962 1.6795962 1.6795962
Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (6.7183848 6.7183848 6.7183848)
1 by 1 by 1 MPI processor grid 1 by 1 by 1 MPI processor grid
Created 256 atoms Created 256 atoms
create_atoms CPU = 0.000297844 secs using lattice units in orthogonal box = (0.0000000 0.0000000 0.0000000) to (6.7183848 6.7183848 6.7183848)
create_atoms CPU = 0.001 seconds
Neighbor list info ... Neighbor list info ...
update every 20 steps, delay 0 steps, check no update every 20 steps, delay 0 steps, check no
max neighbors/atom: 2000, page size: 100000 max neighbors/atom: 2000, page size: 100000
@ -14,108 +17,108 @@ Neighbor list info ...
(1) pair lj/cut, perpetual (1) pair lj/cut, perpetual
attributes: half, newton on attributes: half, newton on
pair build: half/bin/atomonly/newton pair build: half/bin/atomonly/newton
stencil: half/bin/3d/newton stencil: half/bin/3d
bin: standard bin: standard
Setting up Verlet run ... Setting up Verlet run ...
Unit style : lj Unit style : lj
Current step : 0 Current step : 0
Time step : 0.005 Time step : 0.005
Per MPI rank memory allocation (min/avg/max) = 2.63 | 2.63 | 2.63 Mbytes Per MPI rank memory allocation (min/avg/max) = 2.630 | 2.630 | 2.630 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
0 1.44 -6.7733681 0 -4.6218056 -5.0244179 0 1.44 -6.7733681 0 -4.6218056 -5.0244179
10 1.1298532 -6.3095502 0 -4.6213906 -2.6058175 10 1.1298532 -6.3095502 0 -4.6213906 -2.6058175
Loop time of 0.00164276 on 1 procs for 10 steps with 256 atoms Loop time of 0.00239712 on 1 procs for 10 steps with 256 atoms
Performance: 2629719.113 tau/day, 6087.313 timesteps/s Performance: 1802163.347 tau/day, 4171.674 timesteps/s
93.7% CPU use with 1 MPI tasks x no OpenMP threads 97.2% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.0014956 | 0.0014956 | 0.0014956 | 0.0 | 91.04 Pair | 0.0020572 | 0.0020572 | 0.0020572 | 0.0 | 85.82
Neigh | 0 | 0 | 0 | 0.0 | 0.00 Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 8.045e-05 | 8.045e-05 | 8.045e-05 | 0.0 | 4.90 Comm | 0.00018731 | 0.00018731 | 0.00018731 | 0.0 | 7.81
Output | 1.1399e-05 | 1.1399e-05 | 1.1399e-05 | 0.0 | 0.69 Output | 4.478e-05 | 4.478e-05 | 4.478e-05 | 0.0 | 1.87
Modify | 3.7431e-05 | 3.7431e-05 | 3.7431e-05 | 0.0 | 2.28 Modify | 6.3637e-05 | 6.3637e-05 | 6.3637e-05 | 0.0 | 2.65
Other | | 1.789e-05 | | | 1.09 Other | | 4.419e-05 | | | 1.84
Nlocal: 256 ave 256 max 256 min Nlocal: 256.000 ave 256 max 256 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1431 ave 1431 max 1431 min Nghost: 1431.00 ave 1431 max 1431 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 9984 ave 9984 max 9984 min Neighs: 9984.00 ave 9984 max 9984 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 9984 Total # of neighbors = 9984
Ave neighs/atom = 39 Ave neighs/atom = 39.000000
Neighbor list builds = 0 Neighbor list builds = 0
Dangerous builds not checked Dangerous builds not checked
Setting up Verlet run ... Setting up Verlet run ...
Unit style : lj Unit style : lj
Current step : 10 Current step : 10
Time step : 0.005 Time step : 0.005
Per MPI rank memory allocation (min/avg/max) = 2.63 | 2.63 | 2.63 Mbytes Per MPI rank memory allocation (min/avg/max) = 2.630 | 2.630 | 2.630 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
10 1.1298532 -6.3095502 0 -4.6213906 -2.6058175 10 1.1298532 -6.3095502 0 -4.6213906 -2.6058175
20 0.6239063 -5.557644 0 -4.6254403 0.97451173 20 0.6239063 -5.557644 0 -4.6254403 0.97451173
Loop time of 0.00199768 on 1 procs for 10 steps with 256 atoms Loop time of 0.00329271 on 1 procs for 10 steps with 256 atoms
Performance: 2162504.180 tau/day, 5005.797 timesteps/s Performance: 1311987.619 tau/day, 3037.008 timesteps/s
99.8% CPU use with 1 MPI tasks x no OpenMP threads 96.4% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.0018518 | 0.0018518 | 0.0018518 | 0.0 | 92.70 Pair | 0.0029015 | 0.0029015 | 0.0029015 | 0.0 | 88.12
Neigh | 0 | 0 | 0 | 0.0 | 0.00 Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 7.9768e-05 | 7.9768e-05 | 7.9768e-05 | 0.0 | 3.99 Comm | 0.00021807 | 0.00021807 | 0.00021807 | 0.0 | 6.62
Output | 1.1433e-05 | 1.1433e-05 | 1.1433e-05 | 0.0 | 0.57 Output | 4.9163e-05 | 4.9163e-05 | 4.9163e-05 | 0.0 | 1.49
Modify | 3.6904e-05 | 3.6904e-05 | 3.6904e-05 | 0.0 | 1.85 Modify | 7.0573e-05 | 7.0573e-05 | 7.0573e-05 | 0.0 | 2.14
Other | | 1.773e-05 | | | 0.89 Other | | 5.339e-05 | | | 1.62
Nlocal: 256 ave 256 max 256 min Nlocal: 256.000 ave 256 max 256 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1431 ave 1431 max 1431 min Nghost: 1431.00 ave 1431 max 1431 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 9952 ave 9952 max 9952 min Neighs: 9952.00 ave 9952 max 9952 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 9952 Total # of neighbors = 9952
Ave neighs/atom = 38.875 Ave neighs/atom = 38.875000
Neighbor list builds = 0 Neighbor list builds = 0
Dangerous builds not checked Dangerous builds not checked
Setting up Verlet run ... Setting up Verlet run ...
Unit style : lj Unit style : lj
Current step : 20 Current step : 20
Time step : 0.005 Time step : 0.005
Per MPI rank memory allocation (min/avg/max) = 2.63 | 2.63 | 2.63 Mbytes Per MPI rank memory allocation (min/avg/max) = 2.630 | 2.630 | 2.630 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
20 0.6239063 -5.5404291 0 -4.6082254 1.0394285 20 0.6239063 -5.5404291 0 -4.6082254 1.0394285
21 0.63845863 -5.5628733 0 -4.6089263 0.99398278 21 0.63845863 -5.5628733 0 -4.6089263 0.99398278
Loop time of 0.000304321 on 1 procs for 1 steps with 256 atoms Loop time of 0.000638039 on 1 procs for 1 steps with 256 atoms
Performance: 1419553.695 tau/day, 3286.004 timesteps/s Performance: 677074.599 tau/day, 1567.302 timesteps/s
98.9% CPU use with 1 MPI tasks x no OpenMP threads 98.9% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.00027815 | 0.00027815 | 0.00027815 | 0.0 | 91.40 Pair | 0.00042876 | 0.00042876 | 0.00042876 | 0.0 | 67.20
Neigh | 0 | 0 | 0 | 0.0 | 0.00 Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 8.321e-06 | 8.321e-06 | 8.321e-06 | 0.0 | 2.73 Comm | 5.2872e-05 | 5.2872e-05 | 5.2872e-05 | 0.0 | 8.29
Output | 1.0513e-05 | 1.0513e-05 | 1.0513e-05 | 0.0 | 3.45 Output | 0.00012218 | 0.00012218 | 0.00012218 | 0.0 | 19.15
Modify | 3.968e-06 | 3.968e-06 | 3.968e-06 | 0.0 | 1.30 Modify | 1.3762e-05 | 1.3762e-05 | 1.3762e-05 | 0.0 | 2.16
Other | | 3.365e-06 | | | 1.11 Other | | 2.047e-05 | | | 3.21
Nlocal: 256 ave 256 max 256 min Nlocal: 256.000 ave 256 max 256 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1431 ave 1431 max 1431 min Nghost: 1431.00 ave 1431 max 1431 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 9705 ave 9705 max 9705 min Neighs: 9705.00 ave 9705 max 9705 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 9705 Total # of neighbors = 9705
Ave neighs/atom = 37.9102 Ave neighs/atom = 37.910156
Neighbor list builds = 0 Neighbor list builds = 0
Dangerous builds not checked Dangerous builds not checked
Force on 1 atom via extract_atom: 26.9581 Force on 1 atom via extract_atom: 26.9581
@ -124,136 +127,136 @@ Setting up Verlet run ...
Unit style : lj Unit style : lj
Current step : 21 Current step : 21
Time step : 0.005 Time step : 0.005
Per MPI rank memory allocation (min/avg/max) = 2.63 | 2.63 | 2.63 Mbytes Per MPI rank memory allocation (min/avg/max) = 2.630 | 2.630 | 2.630 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
21 0.63845863 -5.5628733 0 -4.6089263 0.99398278 21 0.63845863 -5.5628733 0 -4.6089263 0.99398278
31 0.7494946 -5.7306417 0 -4.6107913 0.41043597 31 0.7494946 -5.7306417 0 -4.6107913 0.41043597
Loop time of 0.00196027 on 1 procs for 10 steps with 256 atoms Loop time of 0.00281277 on 1 procs for 10 steps with 256 atoms
Performance: 2203779.175 tau/day, 5101.341 timesteps/s Performance: 1535852.558 tau/day, 3555.214 timesteps/s
99.7% CPU use with 1 MPI tasks x no OpenMP threads 92.6% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.0018146 | 0.0018146 | 0.0018146 | 0.0 | 92.57 Pair | 0.0024599 | 0.0024599 | 0.0024599 | 0.0 | 87.45
Neigh | 0 | 0 | 0 | 0.0 | 0.00 Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 8.0268e-05 | 8.0268e-05 | 8.0268e-05 | 0.0 | 4.09 Comm | 0.00020234 | 0.00020234 | 0.00020234 | 0.0 | 7.19
Output | 1.0973e-05 | 1.0973e-05 | 1.0973e-05 | 0.0 | 0.56 Output | 3.6436e-05 | 3.6436e-05 | 3.6436e-05 | 0.0 | 1.30
Modify | 3.6913e-05 | 3.6913e-05 | 3.6913e-05 | 0.0 | 1.88 Modify | 6.7542e-05 | 6.7542e-05 | 6.7542e-05 | 0.0 | 2.40
Other | | 1.756e-05 | | | 0.90 Other | | 4.655e-05 | | | 1.65
Nlocal: 256 ave 256 max 256 min Nlocal: 256.000 ave 256 max 256 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1431 ave 1431 max 1431 min Nghost: 1431.00 ave 1431 max 1431 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 9688 ave 9688 max 9688 min Neighs: 9688.00 ave 9688 max 9688 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 9688 Total # of neighbors = 9688
Ave neighs/atom = 37.8438 Ave neighs/atom = 37.843750
Neighbor list builds = 0 Neighbor list builds = 0
Dangerous builds not checked Dangerous builds not checked
Setting up Verlet run ... Setting up Verlet run ...
Unit style : lj Unit style : lj
Current step : 31 Current step : 31
Time step : 0.005 Time step : 0.005
Per MPI rank memory allocation (min/avg/max) = 2.63 | 2.63 | 2.63 Mbytes Per MPI rank memory allocation (min/avg/max) = 2.630 | 2.630 | 2.630 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
31 0.7494946 -5.7306417 0 -4.6107913 0.41043597 31 0.7494946 -5.7306417 0 -4.6107913 0.41043597
51 0.71349216 -5.6772387 0 -4.6111811 0.52117681 51 0.71349216 -5.6772387 0 -4.6111811 0.52117681
Loop time of 0.00433063 on 1 procs for 20 steps with 256 atoms Loop time of 0.00560916 on 1 procs for 20 steps with 256 atoms
Performance: 1995088.941 tau/day, 4618.261 timesteps/s Performance: 1540338.414 tau/day, 3565.598 timesteps/s
99.3% CPU use with 1 MPI tasks x no OpenMP threads 99.2% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.0035121 | 0.0035121 | 0.0035121 | 0.0 | 81.10 Pair | 0.0044403 | 0.0044403 | 0.0044403 | 0.0 | 79.16
Neigh | 0.00050258 | 0.00050258 | 0.00050258 | 0.0 | 11.61 Neigh | 0.00056186 | 0.00056186 | 0.00056186 | 0.0 | 10.02
Comm | 0.00019444 | 0.00019444 | 0.00019444 | 0.0 | 4.49 Comm | 0.00036797 | 0.00036797 | 0.00036797 | 0.0 | 6.56
Output | 1.2092e-05 | 1.2092e-05 | 1.2092e-05 | 0.0 | 0.28 Output | 3.676e-05 | 3.676e-05 | 3.676e-05 | 0.0 | 0.66
Modify | 7.2917e-05 | 7.2917e-05 | 7.2917e-05 | 0.0 | 1.68 Modify | 0.00011282 | 0.00011282 | 0.00011282 | 0.0 | 2.01
Other | | 3.647e-05 | | | 0.84 Other | | 8.943e-05 | | | 1.59
Nlocal: 256 ave 256 max 256 min Nlocal: 256.000 ave 256 max 256 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1421 ave 1421 max 1421 min Nghost: 1421.00 ave 1421 max 1421 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 9700 ave 9700 max 9700 min Neighs: 9700.00 ave 9700 max 9700 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 9700 Total # of neighbors = 9700
Ave neighs/atom = 37.8906 Ave neighs/atom = 37.890625
Neighbor list builds = 1 Neighbor list builds = 1
Dangerous builds not checked Dangerous builds not checked
Setting up Verlet run ... Setting up Verlet run ...
Unit style : lj Unit style : lj
Current step : 51 Current step : 51
Time step : 0.005 Time step : 0.005
Per MPI rank memory allocation (min/avg/max) = 2.63 | 2.63 | 2.63 Mbytes Per MPI rank memory allocation (min/avg/max) = 2.630 | 2.630 | 2.630 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
51 0.71349216 -5.6772387 0 -4.6111811 0.52117681 51 0.71349216 -5.6772387 0 -4.6111811 0.52117681
61 0.78045421 -5.7781094 0 -4.6120011 0.093808941 61 0.78045421 -5.7781094 0 -4.6120011 0.093808941
Loop time of 0.00196567 on 1 procs for 10 steps with 256 atoms Loop time of 0.00373815 on 1 procs for 10 steps with 256 atoms
Performance: 2197727.285 tau/day, 5087.332 timesteps/s Performance: 1155650.623 tau/day, 2675.117 timesteps/s
99.7% CPU use with 1 MPI tasks x no OpenMP threads 98.0% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.0018222 | 0.0018222 | 0.0018222 | 0.0 | 92.70 Pair | 0.0030908 | 0.0030908 | 0.0030908 | 0.0 | 82.68
Neigh | 0 | 0 | 0 | 0.0 | 0.00 Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 7.8285e-05 | 7.8285e-05 | 7.8285e-05 | 0.0 | 3.98 Comm | 0.00038189 | 0.00038189 | 0.00038189 | 0.0 | 10.22
Output | 1.0862e-05 | 1.0862e-05 | 1.0862e-05 | 0.0 | 0.55 Output | 4.1615e-05 | 4.1615e-05 | 4.1615e-05 | 0.0 | 1.11
Modify | 3.6719e-05 | 3.6719e-05 | 3.6719e-05 | 0.0 | 1.87 Modify | 0.00013851 | 0.00013851 | 0.00013851 | 0.0 | 3.71
Other | | 1.764e-05 | | | 0.90 Other | | 8.533e-05 | | | 2.28
Nlocal: 256 ave 256 max 256 min Nlocal: 256.000 ave 256 max 256 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1421 ave 1421 max 1421 min Nghost: 1421.00 ave 1421 max 1421 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 9700 ave 9700 max 9700 min Neighs: 9700.00 ave 9700 max 9700 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 9700 Total # of neighbors = 9700
Ave neighs/atom = 37.8906 Ave neighs/atom = 37.890625
Neighbor list builds = 0 Neighbor list builds = 0
Dangerous builds not checked Dangerous builds not checked
Setting up Verlet run ... Setting up Verlet run ...
Unit style : lj Unit style : lj
Current step : 61 Current step : 61
Time step : 0.005 Time step : 0.005
Per MPI rank memory allocation (min/avg/max) = 2.63 | 2.63 | 2.63 Mbytes Per MPI rank memory allocation (min/avg/max) = 2.630 | 2.630 | 2.630 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
61 0.78045421 -5.7781094 0 -4.6120011 0.093808941 61 0.78045421 -5.7781094 0 -4.6120011 0.093808941
81 0.77743907 -5.7735004 0 -4.6118971 0.090822641 81 0.77743907 -5.7735004 0 -4.6118971 0.090822641
Loop time of 0.00430528 on 1 procs for 20 steps with 256 atoms Loop time of 0.00612177 on 1 procs for 20 steps with 256 atoms
Performance: 2006838.581 tau/day, 4645.460 timesteps/s Performance: 1411356.519 tau/day, 3267.029 timesteps/s
99.8% CPU use with 1 MPI tasks x no OpenMP threads 98.6% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.0034931 | 0.0034931 | 0.0034931 | 0.0 | 81.13 Pair | 0.0047211 | 0.0047211 | 0.0047211 | 0.0 | 77.12
Neigh | 0.00050437 | 0.00050437 | 0.00050437 | 0.0 | 11.72 Neigh | 0.00083088 | 0.00083088 | 0.00083088 | 0.0 | 13.57
Comm | 0.0001868 | 0.0001868 | 0.0001868 | 0.0 | 4.34 Comm | 0.00032716 | 0.00032716 | 0.00032716 | 0.0 | 5.34
Output | 1.1699e-05 | 1.1699e-05 | 1.1699e-05 | 0.0 | 0.27 Output | 3.9891e-05 | 3.9891e-05 | 3.9891e-05 | 0.0 | 0.65
Modify | 7.3308e-05 | 7.3308e-05 | 7.3308e-05 | 0.0 | 1.70 Modify | 0.00010926 | 0.00010926 | 0.00010926 | 0.0 | 1.78
Other | | 3.604e-05 | | | 0.84 Other | | 9.346e-05 | | | 1.53
Nlocal: 256 ave 256 max 256 min Nlocal: 256.000 ave 256 max 256 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1405 ave 1405 max 1405 min Nghost: 1405.00 ave 1405 max 1405 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 9701 ave 9701 max 9701 min Neighs: 9701.00 ave 9701 max 9701 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 9701 Total # of neighbors = 9701
Ave neighs/atom = 37.8945 Ave neighs/atom = 37.894531
Neighbor list builds = 1 Neighbor list builds = 1
Dangerous builds not checked Dangerous builds not checked
Deleted 256 atoms, new total = 0 Deleted 256 atoms, new total = 0
@ -261,34 +264,34 @@ Setting up Verlet run ...
Unit style : lj Unit style : lj
Current step : 81 Current step : 81
Time step : 0.005 Time step : 0.005
Per MPI rank memory allocation (min/avg/max) = 2.63 | 2.63 | 2.63 Mbytes Per MPI rank memory allocation (min/avg/max) = 2.630 | 2.630 | 2.630 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
81 0.6239063 -5.5404291 0 -4.6082254 1.0394285 81 0.6239063 -5.5404291 0 -4.6082254 1.0394285
91 0.75393007 -5.7375259 0 -4.6110484 0.39357367 91 0.75393007 -5.7375259 0 -4.6110484 0.39357367
Loop time of 0.00195843 on 1 procs for 10 steps with 256 atoms Loop time of 0.00319065 on 1 procs for 10 steps with 256 atoms
Performance: 2205851.941 tau/day, 5106.139 timesteps/s Performance: 1353954.393 tau/day, 3134.154 timesteps/s
99.7% CPU use with 1 MPI tasks x no OpenMP threads 99.2% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.0018143 | 0.0018143 | 0.0018143 | 0.0 | 92.64 Pair | 0.0027828 | 0.0027828 | 0.0027828 | 0.0 | 87.22
Neigh | 0 | 0 | 0 | 0.0 | 0.00 Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 7.8608e-05 | 7.8608e-05 | 7.8608e-05 | 0.0 | 4.01 Comm | 0.00023286 | 0.00023286 | 0.00023286 | 0.0 | 7.30
Output | 1.0786e-05 | 1.0786e-05 | 1.0786e-05 | 0.0 | 0.55 Output | 4.0459e-05 | 4.0459e-05 | 4.0459e-05 | 0.0 | 1.27
Modify | 3.7106e-05 | 3.7106e-05 | 3.7106e-05 | 0.0 | 1.89 Modify | 7.3576e-05 | 7.3576e-05 | 7.3576e-05 | 0.0 | 2.31
Other | | 1.762e-05 | | | 0.90 Other | | 6.094e-05 | | | 1.91
Nlocal: 256 ave 256 max 256 min Nlocal: 256.000 ave 256 max 256 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1431 ave 1431 max 1431 min Nghost: 1431.00 ave 1431 max 1431 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 9705 ave 9705 max 9705 min Neighs: 9705.00 ave 9705 max 9705 min
Histogram: 1 0 0 0 0 0 0 0 0 0 Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 9705 Total # of neighbors = 9705
Ave neighs/atom = 37.9102 Ave neighs/atom = 37.910156
Neighbor list builds = 0 Neighbor list builds = 0
Dangerous builds not checked Dangerous builds not checked
Total wall time: 0:00:00 Total wall time: 0:00:00


@ -1,9 +1,12 @@
LAMMPS (18 Feb 2020) LAMMPS (31 Aug 2021)
Lattice spacing in x,y,z = 1.6796 1.6796 1.6796 OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
Created orthogonal box = (0 0 0) to (6.71838 6.71838 6.71838) using 1 OpenMP thread(s) per MPI task
Lattice spacing in x,y,z = 1.6795962 1.6795962 1.6795962
Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (6.7183848 6.7183848 6.7183848)
1 by 1 by 2 MPI processor grid 1 by 1 by 2 MPI processor grid
Created 256 atoms Created 256 atoms
create_atoms CPU = 0.000265157 secs using lattice units in orthogonal box = (0.0000000 0.0000000 0.0000000) to (6.7183848 6.7183848 6.7183848)
create_atoms CPU = 0.003 seconds
Neighbor list info ... Neighbor list info ...
update every 20 steps, delay 0 steps, check no update every 20 steps, delay 0 steps, check no
max neighbors/atom: 2000, page size: 100000 max neighbors/atom: 2000, page size: 100000
@ -14,7 +17,7 @@ Neighbor list info ...
(1) pair lj/cut, perpetual (1) pair lj/cut, perpetual
attributes: half, newton on attributes: half, newton on
pair build: half/bin/atomonly/newton pair build: half/bin/atomonly/newton
stencil: half/bin/3d/newton stencil: half/bin/3d
bin: standard bin: standard
Setting up Verlet run ... Setting up Verlet run ...
Unit style : lj Unit style : lj
@ -24,30 +27,30 @@ Per MPI rank memory allocation (min/avg/max) = 2.624 | 2.624 | 2.624 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
0 1.44 -6.7733681 0 -4.6218056 -5.0244179 0 1.44 -6.7733681 0 -4.6218056 -5.0244179
10 1.1298532 -6.3095502 0 -4.6213906 -2.6058175 10 1.1298532 -6.3095502 0 -4.6213906 -2.6058175
Loop time of 0.00115264 on 2 procs for 10 steps with 256 atoms Loop time of 0.00330899 on 2 procs for 10 steps with 256 atoms
Performance: 3747912.946 tau/day, 8675.724 timesteps/s Performance: 1305535.501 tau/day, 3022.073 timesteps/s
94.5% CPU use with 2 MPI tasks x no OpenMP threads 75.7% CPU use with 2 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.00074885 | 0.00075021 | 0.00075156 | 0.0 | 65.09 Pair | 0.0013522 | 0.0013813 | 0.0014104 | 0.1 | 41.74
Neigh | 0 | 0 | 0 | 0.0 | 0.00 Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 0.00031829 | 0.00031943 | 0.00032056 | 0.0 | 27.71 Comm | 0.00049139 | 0.00054241 | 0.00059342 | 0.0 | 16.39
Output | 9.306e-06 | 2.6673e-05 | 4.4041e-05 | 0.0 | 2.31 Output | 3.6986e-05 | 0.00056588 | 0.0010948 | 0.0 | 17.10
Modify | 2.0684e-05 | 2.0891e-05 | 2.1098e-05 | 0.0 | 1.81 Modify | 4.3909e-05 | 4.3924e-05 | 4.3939e-05 | 0.0 | 1.33
Other | | 3.544e-05 | | | 3.07 Other | | 0.0007755 | | | 23.44
Nlocal: 128 ave 128 max 128 min Nlocal: 128.000 ave 128 max 128 min
Histogram: 2 0 0 0 0 0 0 0 0 0 Histogram: 2 0 0 0 0 0 0 0 0 0
Nghost: 1109 ave 1109 max 1109 min Nghost: 1109.00 ave 1109 max 1109 min
Histogram: 2 0 0 0 0 0 0 0 0 0 Histogram: 2 0 0 0 0 0 0 0 0 0
Neighs: 4992 ave 4992 max 4992 min Neighs: 4992.00 ave 4992 max 4992 min
Histogram: 2 0 0 0 0 0 0 0 0 0 Histogram: 2 0 0 0 0 0 0 0 0 0
Total # of neighbors = 9984 Total # of neighbors = 9984
Ave neighs/atom = 39 Ave neighs/atom = 39.000000
Neighbor list builds = 0 Neighbor list builds = 0
Dangerous builds not checked Dangerous builds not checked
Setting up Verlet run ... Setting up Verlet run ...
@ -58,30 +61,30 @@ Per MPI rank memory allocation (min/avg/max) = 2.624 | 2.624 | 2.624 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
10 1.1298532 -6.3095502 0 -4.6213906 -2.6058175 10 1.1298532 -6.3095502 0 -4.6213906 -2.6058175
20 0.6239063 -5.557644 0 -4.6254403 0.97451173 20 0.6239063 -5.557644 0 -4.6254403 0.97451173
Loop time of 0.00120443 on 2 procs for 10 steps with 256 atoms Loop time of 0.00648485 on 2 procs for 10 steps with 256 atoms
Performance: 3586761.860 tau/day, 8302.689 timesteps/s Performance: 666168.017 tau/day, 1542.056 timesteps/s
95.5% CPU use with 2 MPI tasks x no OpenMP threads 44.3% CPU use with 2 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.00087798 | 0.00091359 | 0.0009492 | 0.0 | 75.85 Pair | 0.0022373 | 0.0024405 | 0.0026437 | 0.4 | 37.63
Neigh | 0 | 0 | 0 | 0.0 | 0.00 Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 0.00016739 | 0.00020368 | 0.00023997 | 0.0 | 16.91 Comm | 0.0024446 | 0.0026464 | 0.0028481 | 0.4 | 40.81
Output | 1.0124e-05 | 3.0513e-05 | 5.0901e-05 | 0.0 | 2.53 Output | 3.9069e-05 | 0.00059734 | 0.0011556 | 0.0 | 9.21
Modify | 1.89e-05 | 1.9812e-05 | 2.0725e-05 | 0.0 | 1.64 Modify | 4.869e-05 | 4.912e-05 | 4.9551e-05 | 0.0 | 0.76
Other | | 3.683e-05 | | | 3.06 Other | | 0.0007515 | | | 11.59
Nlocal: 128 ave 134 max 122 min Nlocal: 128.000 ave 134 max 122 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Nghost: 1109 ave 1115 max 1103 min Nghost: 1109.00 ave 1115 max 1103 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Neighs: 4976 ave 5205 max 4747 min Neighs: 4976.00 ave 5205 max 4747 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Total # of neighbors = 9952 Total # of neighbors = 9952
Ave neighs/atom = 38.875 Ave neighs/atom = 38.875000
Neighbor list builds = 0 Neighbor list builds = 0
Dangerous builds not checked Dangerous builds not checked
Setting up Verlet run ... Setting up Verlet run ...
@ -92,34 +95,34 @@ Per MPI rank memory allocation (min/avg/max) = 2.624 | 2.624 | 2.624 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
20 0.6239063 -5.5404291 0 -4.6082254 1.0394285 20 0.6239063 -5.5404291 0 -4.6082254 1.0394285
21 0.63845863 -5.5628733 0 -4.6089263 0.99398278 21 0.63845863 -5.5628733 0 -4.6089263 0.99398278
Loop time of 0.000206062 on 2 procs for 1 steps with 256 atoms Loop time of 0.00128072 on 2 procs for 1 steps with 256 atoms
Performance: 2096456.406 tau/day, 4852.908 timesteps/s Performance: 337310.921 tau/day, 780.812 timesteps/s
94.1% CPU use with 2 MPI tasks x no OpenMP threads 60.2% CPU use with 2 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.00012947 | 0.00013524 | 0.00014101 | 0.0 | 65.63 Pair | 0.00047351 | 0.00049064 | 0.00050777 | 0.0 | 38.31
Neigh | 0 | 0 | 0 | 0.0 | 0.00 Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 1.858e-05 | 2.4113e-05 | 2.9647e-05 | 0.0 | 11.70 Comm | 7.6767e-05 | 9.3655e-05 | 0.00011054 | 0.0 | 7.31
Output | 8.699e-06 | 2.4204e-05 | 3.9708e-05 | 0.0 | 11.75 Output | 5.4217e-05 | 0.00026297 | 0.00047172 | 0.0 | 20.53
Modify | 2.34e-06 | 2.3705e-06 | 2.401e-06 | 0.0 | 1.15 Modify | 1.1554e-05 | 1.2026e-05 | 1.2498e-05 | 0.0 | 0.94
Other | | 2.013e-05 | | | 9.77 Other | | 0.0004214 | | | 32.91
Nlocal: 128 ave 135 max 121 min Nlocal: 128.000 ave 135 max 121 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Nghost: 1109 ave 1116 max 1102 min Nghost: 1109.00 ave 1116 max 1102 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Neighs: 4852.5 ave 5106 max 4599 min Neighs: 4852.50 ave 5106 max 4599 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Total # of neighbors = 9705 Total # of neighbors = 9705
Ave neighs/atom = 37.9102 Ave neighs/atom = 37.910156
Force on 1 atom via extract_atom: -18.109
Force on 1 atom via extract_variable: -18.109
Neighbor list builds = 0 Neighbor list builds = 0
Dangerous builds not checked Dangerous builds not checked
Force on 1 atom via extract_atom: -18.109
Force on 1 atom via extract_variable: -18.109
Force on 1 atom via extract_atom: 26.9581 Force on 1 atom via extract_atom: 26.9581
Force on 1 atom via extract_variable: 26.9581 Force on 1 atom via extract_variable: 26.9581
Setting up Verlet run ... Setting up Verlet run ...
@ -130,30 +133,30 @@ Per MPI rank memory allocation (min/avg/max) = 2.624 | 2.624 | 2.624 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
21 0.63845863 -5.5628733 0 -4.6089263 0.99398278 21 0.63845863 -5.5628733 0 -4.6089263 0.99398278
31 0.7494946 -5.7306417 0 -4.6107913 0.41043597 31 0.7494946 -5.7306417 0 -4.6107913 0.41043597
Loop time of 0.00119048 on 2 procs for 10 steps with 256 atoms Loop time of 0.00784933 on 2 procs for 10 steps with 256 atoms
Performance: 3628802.105 tau/day, 8400.005 timesteps/s Performance: 550365.761 tau/day, 1273.995 timesteps/s
98.0% CPU use with 2 MPI tasks x no OpenMP threads 59.6% CPU use with 2 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.00085276 | 0.00089699 | 0.00094123 | 0.0 | 75.35 Pair | 0.0019235 | 0.0033403 | 0.0047572 | 2.5 | 42.56
Neigh | 0 | 0 | 0 | 0.0 | 0.00 Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 0.00016896 | 0.00021444 | 0.00025992 | 0.0 | 18.01 Comm | 0.0016948 | 0.003118 | 0.0045411 | 2.5 | 39.72
Output | 9.413e-06 | 2.5939e-05 | 4.2465e-05 | 0.0 | 2.18 Output | 3.6445e-05 | 0.00064636 | 0.0012563 | 0.0 | 8.23
Modify | 1.8977e-05 | 2.0009e-05 | 2.1042e-05 | 0.0 | 1.68 Modify | 6.2842e-05 | 6.3209e-05 | 6.3577e-05 | 0.0 | 0.81
Other | | 3.31e-05 | | | 2.78 Other | | 0.0006814 | | | 8.68
Nlocal: 128 ave 135 max 121 min Nlocal: 128.000 ave 135 max 121 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Nghost: 1109 ave 1116 max 1102 min Nghost: 1109.00 ave 1116 max 1102 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Neighs: 4844 ave 5096 max 4592 min Neighs: 4844.00 ave 5096 max 4592 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Total # of neighbors = 9688 Total # of neighbors = 9688
Ave neighs/atom = 37.8438 Ave neighs/atom = 37.843750
Neighbor list builds = 0 Neighbor list builds = 0
Dangerous builds not checked Dangerous builds not checked
Setting up Verlet run ... Setting up Verlet run ...
@ -164,30 +167,30 @@ Per MPI rank memory allocation (min/avg/max) = 2.624 | 2.624 | 2.624 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
31 0.7494946 -5.7306417 0 -4.6107913 0.41043597 31 0.7494946 -5.7306417 0 -4.6107913 0.41043597
51 0.71349216 -5.6772387 0 -4.6111811 0.52117681 51 0.71349216 -5.6772387 0 -4.6111811 0.52117681
Loop time of 0.00252603 on 2 procs for 20 steps with 256 atoms Loop time of 0.00696051 on 2 procs for 20 steps with 256 atoms
Performance: 3420382.192 tau/day, 7917.551 timesteps/s Performance: 1241287.730 tau/day, 2873.351 timesteps/s
99.2% CPU use with 2 MPI tasks x no OpenMP threads 79.2% CPU use with 2 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.0016245 | 0.0017014 | 0.0017784 | 0.2 | 67.36 Pair | 0.0028267 | 0.0036088 | 0.004391 | 1.3 | 51.85
Neigh | 0.00025359 | 0.0002563 | 0.00025901 | 0.0 | 10.15 Neigh | 0.00040272 | 0.00040989 | 0.00041707 | 0.0 | 5.89
Comm | 0.00036863 | 0.00045124 | 0.00053385 | 0.0 | 17.86 Comm | 0.00081061 | 0.0015825 | 0.0023544 | 1.9 | 22.74
Output | 9.839e-06 | 2.8031e-05 | 4.6223e-05 | 0.0 | 1.11 Output | 3.6006e-05 | 0.00062106 | 0.0012061 | 0.0 | 8.92
Modify | 3.7027e-05 | 3.9545e-05 | 4.2063e-05 | 0.0 | 1.57 Modify | 6.8937e-05 | 7.1149e-05 | 7.336e-05 | 0.0 | 1.02
Other | | 4.948e-05 | | | 1.96 Other | | 0.0006671 | | | 9.58
Nlocal: 128 ave 132 max 124 min Nlocal: 128.000 ave 132 max 124 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Nghost: 1100 ave 1101 max 1099 min Nghost: 1100.00 ave 1101 max 1099 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Neighs: 4850 ave 4953 max 4747 min Neighs: 4850.00 ave 4953 max 4747 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Total # of neighbors = 9700 Total # of neighbors = 9700
Ave neighs/atom = 37.8906 Ave neighs/atom = 37.890625
Neighbor list builds = 1 Neighbor list builds = 1
Dangerous builds not checked Dangerous builds not checked
Setting up Verlet run ... Setting up Verlet run ...
@ -198,30 +201,30 @@ Per MPI rank memory allocation (min/avg/max) = 2.624 | 2.624 | 2.624 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
51 0.71349216 -5.6772387 0 -4.6111811 0.52117681 51 0.71349216 -5.6772387 0 -4.6111811 0.52117681
61 0.78045421 -5.7781094 0 -4.6120011 0.093808941 61 0.78045421 -5.7781094 0 -4.6120011 0.093808941
Loop time of 0.00115444 on 2 procs for 10 steps with 256 atoms Loop time of 0.00155862 on 2 procs for 10 steps with 256 atoms
Performance: 3742065.976 tau/day, 8662.190 timesteps/s Performance: 2771678.197 tau/day, 6415.922 timesteps/s
96.5% CPU use with 2 MPI tasks x no OpenMP threads 95.0% CPU use with 2 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.00087346 | 0.00089311 | 0.00091275 | 0.0 | 77.36 Pair | 0.0012369 | 0.001266 | 0.001295 | 0.1 | 81.22
Neigh | 0 | 0 | 0 | 0.0 | 0.00 Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 0.00016192 | 0.0001823 | 0.00020269 | 0.0 | 15.79 Comm | 0.00019462 | 0.00022315 | 0.00025169 | 0.0 | 14.32
Output | 9.49e-06 | 2.6234e-05 | 4.2978e-05 | 0.0 | 2.27 Output | 2.0217e-05 | 2.1945e-05 | 2.3673e-05 | 0.0 | 1.41
Modify | 1.9095e-05 | 1.9843e-05 | 2.0591e-05 | 0.0 | 1.72 Modify | 2.562e-05 | 2.5759e-05 | 2.5898e-05 | 0.0 | 1.65
Other | | 3.296e-05 | | | 2.85 Other | | 2.181e-05 | | | 1.40
Nlocal: 128 ave 132 max 124 min Nlocal: 128.000 ave 132 max 124 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Nghost: 1100 ave 1101 max 1099 min Nghost: 1100.00 ave 1101 max 1099 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Neighs: 4850 ave 4953 max 4747 min Neighs: 4850.00 ave 4953 max 4747 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Total # of neighbors = 9700 Total # of neighbors = 9700
Ave neighs/atom = 37.8906 Ave neighs/atom = 37.890625
Neighbor list builds = 0 Neighbor list builds = 0
Dangerous builds not checked Dangerous builds not checked
Setting up Verlet run ... Setting up Verlet run ...
@ -232,30 +235,30 @@ Per MPI rank memory allocation (min/avg/max) = 2.624 | 2.624 | 2.624 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
61 0.78045421 -5.7781094 0 -4.6120011 0.093808941 61 0.78045421 -5.7781094 0 -4.6120011 0.093808941
81 0.77743907 -5.7735004 0 -4.6118971 0.090822641 81 0.77743907 -5.7735004 0 -4.6118971 0.090822641
Loop time of 0.00244325 on 2 procs for 20 steps with 256 atoms Loop time of 0.00351607 on 2 procs for 20 steps with 256 atoms
Performance: 3536279.919 tau/day, 8185.833 timesteps/s Performance: 2457288.612 tau/day, 5688.168 timesteps/s
99.0% CPU use with 2 MPI tasks x no OpenMP threads 97.9% CPU use with 2 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.0016916 | 0.0017038 | 0.001716 | 0.0 | 69.73 Pair | 0.0023896 | 0.0024147 | 0.0024397 | 0.1 | 68.67
Neigh | 0.00025229 | 0.00025512 | 0.00025795 | 0.0 | 10.44 Neigh | 0.00037331 | 0.00040456 | 0.0004358 | 0.0 | 11.51
Comm | 0.00035772 | 0.00036918 | 0.00038064 | 0.0 | 15.11 Comm | 0.00050571 | 0.00051343 | 0.00052116 | 0.0 | 14.60
Output | 1.0858e-05 | 2.7875e-05 | 4.4891e-05 | 0.0 | 1.14 Output | 2.6424e-05 | 5.6547e-05 | 8.667e-05 | 0.0 | 1.61
Modify | 3.817e-05 | 3.9325e-05 | 4.048e-05 | 0.0 | 1.61 Modify | 5.0287e-05 | 5.1071e-05 | 5.1856e-05 | 0.0 | 1.45
Other | | 4.796e-05 | | | 1.96 Other | | 7.58e-05 | | | 2.16
Nlocal: 128 ave 128 max 128 min Nlocal: 128.000 ave 128 max 128 min
Histogram: 2 0 0 0 0 0 0 0 0 0 Histogram: 2 0 0 0 0 0 0 0 0 0
Nghost: 1088.5 ave 1092 max 1085 min Nghost: 1088.50 ave 1092 max 1085 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Neighs: 4850.5 ave 4851 max 4850 min Neighs: 4850.50 ave 4851 max 4850 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Total # of neighbors = 9701 Total # of neighbors = 9701
Ave neighs/atom = 37.8945 Ave neighs/atom = 37.894531
Neighbor list builds = 1 Neighbor list builds = 1
Dangerous builds not checked Dangerous builds not checked
Deleted 256 atoms, new total = 0 Deleted 256 atoms, new total = 0
@ -267,30 +270,30 @@ Per MPI rank memory allocation (min/avg/max) = 2.624 | 2.624 | 2.624 Mbytes
Step Temp E_pair E_mol TotEng Press Step Temp E_pair E_mol TotEng Press
81 0.6239063 -5.5404291 0 -4.6082254 1.0394285 81 0.6239063 -5.5404291 0 -4.6082254 1.0394285
91 0.75393007 -5.7375259 0 -4.6110484 0.39357367 91 0.75393007 -5.7375259 0 -4.6110484 0.39357367
Loop time of 0.00118092 on 2 procs for 10 steps with 256 atoms Loop time of 0.0109747 on 2 procs for 10 steps with 256 atoms
Performance: 3658158.625 tau/day, 8467.960 timesteps/s Performance: 393631.731 tau/day, 911.185 timesteps/s
98.6% CPU use with 2 MPI tasks x no OpenMP threads 53.5% CPU use with 2 MPI tasks x 1 OpenMP threads
MPI task timing breakdown: MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total Section | min time | avg time | max time |%varavg| %total
--------------------------------------------------------------- ---------------------------------------------------------------
Pair | 0.0008476 | 0.00089265 | 0.00093771 | 0.0 | 75.59 Pair | 0.0012057 | 0.0012732 | 0.0013407 | 0.2 | 11.60
Neigh | 0 | 0 | 0 | 0.0 | 0.00 Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 0.00016335 | 0.00020946 | 0.00025557 | 0.0 | 17.74 Comm | 0.00018882 | 0.00025686 | 0.00032489 | 0.0 | 2.34
Output | 8.87e-06 | 2.5733e-05 | 4.2595e-05 | 0.0 | 2.18 Output | 2.1943e-05 | 0.0047067 | 0.0093915 | 6.8 | 42.89
Modify | 1.8755e-05 | 1.9814e-05 | 2.0872e-05 | 0.0 | 1.68 Modify | 2.4614e-05 | 2.5439e-05 | 2.6264e-05 | 0.0 | 0.23
Other | | 3.326e-05 | | | 2.82 Other | | 0.004712 | | | 42.94
Nlocal: 128 ave 135 max 121 min Nlocal: 128.000 ave 135 max 121 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Nghost: 1109 ave 1116 max 1102 min Nghost: 1109.00 ave 1116 max 1102 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Neighs: 4852.5 ave 5106 max 4599 min Neighs: 4852.50 ave 5106 max 4599 min
Histogram: 1 0 0 0 0 0 0 0 0 1 Histogram: 1 0 0 0 0 0 0 0 0 1
Total # of neighbors = 9705 Total # of neighbors = 9705
Ave neighs/atom = 37.9102 Ave neighs/atom = 37.910156
Neighbor list builds = 0 Neighbor list builds = 0
Dangerous builds not checked Dangerous builds not checked
Total wall time: 0:00:00 Total wall time: 0:00:00


@ -87,7 +87,7 @@ int main(int narg, char **arg)
MPI_Abort(MPI_COMM_WORLD,1); MPI_Abort(MPI_COMM_WORLD,1);
} }
} }
if (lammps == 1) plugin->open(0,NULL,comm_lammps,&lmp); if (lammps == 1) lmp = plugin->open(0,NULL,comm_lammps,NULL);
while (1) { while (1) {
if (me == 0) { if (me == 0) {
@ -139,7 +139,7 @@ int main(int narg, char **arg)
cmds[0] = (char *)"run 10"; cmds[0] = (char *)"run 10";
cmds[1] = (char *)"run 20"; cmds[1] = (char *)"run 20";
if (lammps == 1) plugin->commands_list(lmp,2,cmds); if (lammps == 1) plugin->commands_list(lmp,2,(const char **)cmds);
/* delete all atoms /* delete all atoms
create_atoms() to create new ones with old coords, vels create_atoms() to create new ones with old coords, vels
@ -164,12 +164,13 @@ int main(int narg, char **arg)
if (lammps == 1) { if (lammps == 1) {
plugin->close(lmp); plugin->close(lmp);
MPI_Barrier(comm_lammps);
MPI_Comm_free(&comm_lammps);
liblammpsplugin_release(plugin); liblammpsplugin_release(plugin);
} }
/* close down MPI */ /* close down MPI */
if (lammps == 1) MPI_Comm_free(&comm_lammps);
MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize(); MPI_Finalize();
} }


@ -8,7 +8,7 @@ bond_style harmonic
bond_coeff 1 100 1.122462 # K R0 bond_coeff 1 100 1.122462 # K R0
velocity all create 1.0 8008 loop geom velocity all create 1.0 8008 loop geom
pair_style lj/cut/coul/long 1.122462 20 pair_style lj/cut/coul/long/soft 2 0.5 10.0 1.122462 20
pair_coeff * * 1.0 1.0 1.122462 # charges pair_coeff * * 1.0 1.0 1.122462 # charges
kspace_style pppm 1.0e-3 kspace_style pppm 1.0e-3
pair_modify shift yes pair_modify shift yes


@ -1476,7 +1476,9 @@ int colvarmodule::write_output_files()
bi != biases.end(); bi != biases.end();
bi++) { bi++) {
// Only write output files if they have not already been written this time step // Only write output files if they have not already been written this time step
if ((*bi)->output_freq == 0 || (cvm::step_absolute() % (*bi)->output_freq) != 0) { if ((*bi)->output_freq == 0 ||
cvm::step_relative() == 0 ||
(cvm::step_absolute() % (*bi)->output_freq) != 0) {
error_code |= (*bi)->write_output_files(); error_code |= (*bi)->write_output_files();
} }
error_code |= (*bi)->write_state_to_replicas(); error_code |= (*bi)->write_state_to_replicas();


@ -1,3 +1,3 @@
#ifndef COLVARS_VERSION #ifndef COLVARS_VERSION
#define COLVARS_VERSION "2021-08-06" #define COLVARS_VERSION "2021-09-21"
#endif #endif


@ -462,7 +462,6 @@ int UCL_Device::set_platform(int pid) {
_num_devices = 0; _num_devices = 0;
for (int i=0; i<num_unpart; i++) { for (int i=0; i<num_unpart; i++) {
cl_uint num_subdevices = 1; cl_uint num_subdevices = 1;
cl_device_id *subdevice_list = device_list + i;
#ifdef CL_VERSION_1_2 #ifdef CL_VERSION_1_2
cl_device_affinity_domain adomain; cl_device_affinity_domain adomain;
@ -479,19 +478,21 @@ int UCL_Device::set_platform(int pid) {
CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, 0, NULL, CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, 0, NULL,
&num_subdevices)); &num_subdevices));
if (num_subdevices > 1) { if (num_subdevices > 1) {
subdevice_list = new cl_device_id[num_subdevices]; cl_device_id *subdevice_list = new cl_device_id[num_subdevices];
CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, num_subdevices, CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, num_subdevices,
subdevice_list, &num_subdevices)); subdevice_list, &num_subdevices));
for (int j=0; j<num_subdevices; j++) {
_cl_devices.push_back(device_list[i]);
add_properties(device_list[i]);
_num_devices++;
}
delete[] subdevice_list;
} else {
_cl_devices.push_back(device_list[i]);
add_properties(device_list[i]);
_num_devices++;
} }
#endif #endif
for (int j=0; j<num_subdevices; j++) {
_num_devices++;
_cl_devices.push_back(subdevice_list[j]);
add_properties(subdevice_list[j]);
}
if (num_subdevices > 1) delete[] subdevice_list;
} // for i } // for i
#endif #endif
@ -555,16 +556,22 @@ void UCL_Device::add_properties(cl_device_id device_list) {
sizeof(float_width),&float_width,nullptr)); sizeof(float_width),&float_width,nullptr));
op.preferred_vector_width32=float_width; op.preferred_vector_width32=float_width;
// Determine if double precision is supported
cl_uint double_width; cl_uint double_width;
CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
sizeof(double_width),&double_width,nullptr)); sizeof(double_width),&double_width,nullptr));
op.preferred_vector_width64=double_width; op.preferred_vector_width64=double_width;
if (double_width==0)
op.double_precision=false; // Determine if double precision is supported: All bits in the mask must be set.
else cl_device_fp_config double_mask = (CL_FP_FMA|CL_FP_ROUND_TO_NEAREST|CL_FP_ROUND_TO_ZERO|
CL_FP_ROUND_TO_INF|CL_FP_INF_NAN|CL_FP_DENORM);
cl_device_fp_config double_avail;
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_DOUBLE_FP_CONFIG,
sizeof(double_avail),&double_avail,nullptr));
if ((double_avail & double_mask) == double_mask)
op.double_precision=true; op.double_precision=true;
else
op.double_precision=false;
CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PROFILING_TIMER_RESOLUTION, CL_DEVICE_PROFILING_TIMER_RESOLUTION,


@ -38,8 +38,10 @@ namespace ucl_opencl {
/// Class for timing OpenCL events /// Class for timing OpenCL events
class UCL_Timer { class UCL_Timer {
public: public:
inline UCL_Timer() : _total_time(0.0f), _initialized(false), has_measured_time(false) { } inline UCL_Timer() : start_event(nullptr), stop_event(nullptr), _total_time(0.0f),
inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false), has_measured_time(false) _initialized(false), has_measured_time(false) { }
inline UCL_Timer(UCL_Device &dev) : start_event(nullptr), stop_event(nullptr), _total_time(0.0f),
_initialized(false), has_measured_time(false)
{ init(dev); } { init(dev); }
inline ~UCL_Timer() { clear(); } inline ~UCL_Timer() { clear(); }


@ -127,9 +127,8 @@ class Answer {
/// Add forces and torques from the GPU into a LAMMPS pointer /// Add forces and torques from the GPU into a LAMMPS pointer
void get_answers(double **f, double **tor); void get_answers(double **f, double **tor);
inline double get_answers(double **f, double **tor, double *eatom, inline double get_answers(double **f, double **tor, double *eatom, double **vatom,
double **vatom, double *virial, double &ecoul, double *virial, double &ecoul, int &error_flag_in) {
int &error_flag_in) {
double ta=MPI_Wtime(); double ta=MPI_Wtime();
time_answer.sync_stop(); time_answer.sync_stop();
_time_cpu_idle+=MPI_Wtime()-ta; _time_cpu_idle+=MPI_Wtime()-ta;


@ -34,7 +34,7 @@ BornCoulLongT::BornCoulLong() : BaseCharge<numtyp,acctyp>(),
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
BornCoulLongT::~BornCoulLongT() { BornCoulLongT::~BornCoulLong() {
clear(); clear();
} }


@ -34,7 +34,7 @@ BornCoulWolfT::BornCoulWolf() : BaseCharge<numtyp,acctyp>(),
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
BornCoulWolfT::~BornCoulWolfT() { BornCoulWolfT::~BornCoulWolf() {
clear(); clear();
} }


@ -34,7 +34,7 @@ BuckCoulLongT::BuckCoulLong() : BaseCharge<numtyp,acctyp>(),
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
BuckCoulLongT::~BuckCoulLongT() { BuckCoulLongT::~BuckCoulLong() {
clear(); clear();
} }


@ -333,6 +333,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
gpu_barrier(); gpu_barrier();
} }
// check if double precision support is available
#if defined(_SINGLE_DOUBLE) || defined(_DOUBLE_DOUBLE)
if (!gpu->double_precision())
return -16;
#endif
// Setup auto bin size calculation for calls from atom::sort // Setup auto bin size calculation for calls from atom::sort
// - This is repeated in neighbor init with additional info // - This is repeated in neighbor init with additional info
if (_user_cell_size<0.0) { if (_user_cell_size<0.0) {
@ -348,7 +354,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) { int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args) {
#ifdef USE_OPENCL #ifdef USE_OPENCL
#include "lal_pre_ocl_config.h" #include "lal_pre_ocl_config.h"
@ -368,7 +374,7 @@ int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) {
int token_count=0; int token_count=0;
std::string params[18]; std::string params[18];
char ocl_config[2048]; char ocl_config[2048];
strcpy(ocl_config,s_config.c_str()); strncpy(ocl_config,s_config.c_str(),2047);
char *pch = strtok(ocl_config,","); char *pch = strtok(ocl_config,",");
_ocl_config_name=pch; _ocl_config_name=pch;
pch = strtok(nullptr,","); pch = strtok(nullptr,",");
@ -546,14 +552,9 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
return -3; return -3;
if (_user_cell_size<0.0) { if (_user_cell_size<0.0) {
#ifndef LAL_USE_OLD_NEIGHBOR
_neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size());
#else
_neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size()); _neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size());
#endif
} else } else
_neighbor_shared.setup_auto_cell_size(false,_user_cell_size, _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,nbor->simd_size());
nbor->simd_size());
nbor->set_cutoff(cutoff); nbor->set_cutoff(cutoff);
return 0; return 0;
@ -992,10 +993,8 @@ int DeviceT::compile_kernels() {
static_cast<size_t>(_block_cell_2d) > gpu->group_size_dim(0) || static_cast<size_t>(_block_cell_2d) > gpu->group_size_dim(0) ||
static_cast<size_t>(_block_cell_2d) > gpu->group_size_dim(1) || static_cast<size_t>(_block_cell_2d) > gpu->group_size_dim(1) ||
static_cast<size_t>(_block_cell_id) > gpu->group_size_dim(0) || static_cast<size_t>(_block_cell_id) > gpu->group_size_dim(0) ||
static_cast<size_t>(_max_shared_types*_max_shared_types* static_cast<size_t>(_max_shared_types*_max_shared_types*sizeof(numtyp)*17 > gpu->slm_size()) ||
sizeof(numtyp)*17 > gpu->slm_size()) || static_cast<size_t>(_max_bio_shared_types*2*sizeof(numtyp) > gpu->slm_size()))
static_cast<size_t>(_max_bio_shared_types*2*sizeof(numtyp) >
gpu->slm_size()))
return -13; return -13;
if (_block_pair % _simd_size != 0 || _block_bio_pair % _simd_size != 0 || if (_block_pair % _simd_size != 0 || _block_bio_pair % _simd_size != 0 ||
@ -1071,9 +1070,8 @@ void lmp_clear_device() {
global_device.clear_device(); global_device.clear_device();
} }
double lmp_gpu_forces(double **f, double **tor, double *eatom, double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom,
double **vatom, double *virial, double &ecoul, double *virial, double &ecoul, int &error_flag) {
int &error_flag) {
return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul,error_flag); return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul,error_flag);
} }


@ -163,17 +163,15 @@ class Device {
{ ans_queue.push(ans); } { ans_queue.push(ans); }
/// Add "answers" (force,energies,etc.) into LAMMPS structures /// Add "answers" (force,energies,etc.) into LAMMPS structures
inline double fix_gpu(double **f, double **tor, double *eatom, inline double fix_gpu(double **f, double **tor, double *eatom, double **vatom,
double **vatom, double *virial, double &ecoul, double *virial, double &ecoul, int &error_flag) {
int &error_flag) {
error_flag=0; error_flag=0;
atom.data_unavail(); atom.data_unavail();
if (ans_queue.empty()==false) { if (ans_queue.empty()==false) {
stop_host_timer(); stop_host_timer();
double evdw=0.0; double evdw=0.0;
while (ans_queue.empty()==false) { while (ans_queue.empty()==false) {
evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul, evdw += ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul,error_flag);
error_flag);
ans_queue.pop(); ans_queue.pop();
} }
return evdw; return evdw;
@ -350,7 +348,7 @@ class Device {
int _data_in_estimate, _data_out_estimate; int _data_in_estimate, _data_out_estimate;
std::string _ocl_config_name, _ocl_config_string, _ocl_compile_string; std::string _ocl_config_name, _ocl_config_string, _ocl_compile_string;
int set_ocl_params(std::string, std::string); int set_ocl_params(std::string, const std::string &);
}; };
} }


@ -39,7 +39,7 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
const int block_cell_2d, const int block_cell_id, const int block_cell_2d, const int block_cell_id,
const int block_nbor_build, const int threads_per_atom, const int block_nbor_build, const int threads_per_atom,
const int simd_size, const bool time_device, const int simd_size, const bool time_device,
const std::string compile_flags, const bool ilist_map) { const std::string &compile_flags, const bool ilist_map) {
clear(); clear();
_ilist_map = ilist_map; _ilist_map = ilist_map;
@ -743,7 +743,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
mn = _max_nbors; mn = _max_nbors;
const numtyp i_cell_size=static_cast<numtyp>(1.0/_cell_size); const numtyp i_cell_size=static_cast<numtyp>(1.0/_cell_size);
const int neigh_block=_block_cell_id; const int neigh_block=_block_cell_id;
const int GX=(int)ceil((float)nall/neigh_block); const int GX=(int)ceil((double)nall/neigh_block);
const numtyp sublo0=static_cast<numtyp>(sublo[0]); const numtyp sublo0=static_cast<numtyp>(sublo[0]);
const numtyp sublo1=static_cast<numtyp>(sublo[1]); const numtyp sublo1=static_cast<numtyp>(sublo[1]);
const numtyp sublo2=static_cast<numtyp>(sublo[2]); const numtyp sublo2=static_cast<numtyp>(sublo[2]);


@ -71,7 +71,7 @@ class Neighbor {
const int block_cell_2d, const int block_cell_id, const int block_cell_2d, const int block_cell_id,
const int block_nbor_build, const int threads_per_atom, const int block_nbor_build, const int threads_per_atom,
const int simd_size, const bool time_device, const int simd_size, const bool time_device,
const std::string compile_flags, const bool ilist_map); const std::string &compile_flags, const bool ilist_map);
/// Set the cutoff+skin /// Set the cutoff+skin
inline void set_cutoff(const double cutoff) { inline void set_cutoff(const double cutoff) {


@ -89,7 +89,7 @@ double NeighborShared::best_cell_size(const double subx, const double suby,
} }
void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor, void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor,
const std::string flags) { const std::string &flags) {
if (_compiled) if (_compiled)
return; return;


@ -87,7 +87,7 @@ class NeighborShared {
/// Compile kernels for neighbor lists /// Compile kernels for neighbor lists
void compile_kernels(UCL_Device &dev, const int gpu_nbor, void compile_kernels(UCL_Device &dev, const int gpu_nbor,
const std::string flags); const std::string &flags);
// ----------------------------- Kernels // ----------------------------- Kernels
UCL_Program *nbor_program, *build_program; UCL_Program *nbor_program, *build_program;


@ -69,14 +69,14 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
flag=device->init(*ans,nlocal,nall); flag=device->init(*ans,nlocal,nall);
if (flag!=0) if (flag!=0)
return 0; return nullptr;
if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) { if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
flag=-15; flag=-15;
return 0; return nullptr;
} }
if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) { if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) {
flag=-4; flag=-4;
return 0; return nullptr;
} }
ucl_device=device->gpu; ucl_device=device->gpu;
@ -168,7 +168,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
UCL_READ_WRITE)==UCL_SUCCESS); UCL_READ_WRITE)==UCL_SUCCESS);
if (!success) { if (!success) {
flag=-3; flag=-3;
return 0; return nullptr;
} }
error_flag.device.zero(); error_flag.device.zero();
@ -342,13 +342,15 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) {
vd_brick.update_device(true); vd_brick.update_device(true);
time_in.stop(); time_in.stop();
int ainum=this->ans->inum();
if (ainum==0)
return;
time_interp.start(); time_interp.start();
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
int BX=this->block_size(); int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX)); int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->ans->inum();
k_interp.set_size(GX,BX); k_interp.set_size(GX,BX);
k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff, k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff,
&_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv,


@ -2,8 +2,8 @@ SHELL = /bin/sh
# ------ FILES ------ # ------ FILES ------
SRC_FILES = $(wildcard src/ML-PACE/*.cpp) SRC_FILES = $(wildcard src/USER-PACE/*.cpp)
SRC = $(filter-out src/ML-PACE/pair_pace.cpp, $(SRC_FILES)) SRC = $(filter-out src/USER-PACE/pair_pace.cpp, $(SRC_FILES))
# ------ DEFINITIONS ------ # ------ DEFINITIONS ------
@ -12,7 +12,7 @@ OBJ = $(SRC:.cpp=.o)
# ------ SETTINGS ------ # ------ SETTINGS ------
CXXFLAGS = -O3 -fPIC -Isrc/ML-PACE CXXFLAGS = -O3 -fPIC -Isrc/USER-PACE
ARCHIVE = ar ARCHIVE = ar
ARCHFLAG = -rc ARCHFLAG = -rc


@ -1,3 +1,3 @@
pace_SYSINC =-I../../lib/pace/src/ML-PACE pace_SYSINC =-I../../lib/pace/src/USER-PACE
pace_SYSLIB = -L../../lib/pace/ -lpace pace_SYSLIB = -L../../lib/pace/ -lpace
pace_SYSPATH = pace_SYSPATH =


@ -92,8 +92,12 @@ class numpy_wrapper:
if dim == LAMMPS_AUTODETECT: if dim == LAMMPS_AUTODETECT:
if dtype in (LAMMPS_INT_2D, LAMMPS_DOUBLE_2D, LAMMPS_INT64_2D): if dtype in (LAMMPS_INT_2D, LAMMPS_DOUBLE_2D, LAMMPS_INT64_2D):
# TODO add other fields # TODO add other fields
if name in ("x", "v", "f", "angmom", "torque", "csforce", "vforce"): if name in ("x", "v", "f", "x0", "omega", "angmom", "torque", "vforce", "vest"):
dim = 3 dim = 3
elif name == "smd_data_9":
dim = 9
elif name == "smd_stress":
dim = 6
else: else:
dim = 2 dim = 2
else: else:
@ -386,6 +390,9 @@ class numpy_wrapper:
# ------------------------------------------------------------------------- # -------------------------------------------------------------------------
def iarray(self, c_int_type, raw_ptr, nelem, dim=1): def iarray(self, c_int_type, raw_ptr, nelem, dim=1):
if raw_ptr is None:
return None
import numpy as np import numpy as np
np_int_type = self._ctype_to_numpy_int(c_int_type) np_int_type = self._ctype_to_numpy_int(c_int_type)
@ -405,7 +412,11 @@ class numpy_wrapper:
# ------------------------------------------------------------------------- # -------------------------------------------------------------------------
def darray(self, raw_ptr, nelem, dim=1): def darray(self, raw_ptr, nelem, dim=1):
if raw_ptr is None:
return None
import numpy as np import numpy as np
if dim == 1: if dim == 1:
ptr = cast(raw_ptr, POINTER(c_double * nelem)) ptr = cast(raw_ptr, POINTER(c_double * nelem))
else: else:

src/.gitignore

@ -860,8 +860,6 @@
/fix_ti_rs.h /fix_ti_rs.h
/fix_ti_spring.cpp /fix_ti_spring.cpp
/fix_ti_spring.h /fix_ti_spring.h
/fix_ttm.cpp
/fix_ttm.h
/fix_tune_kspace.cpp /fix_tune_kspace.cpp
/fix_tune_kspace.h /fix_tune_kspace.h
/fix_wall_body_polygon.cpp /fix_wall_body_polygon.cpp
@ -921,6 +919,7 @@
/improper_ring.h /improper_ring.h
/improper_umbrella.cpp /improper_umbrella.cpp
/improper_umbrella.h /improper_umbrella.h
/interlayer_taper.h
/kissfft.h /kissfft.h
/lj_sdk_common.h /lj_sdk_common.h
/math_complex.h /math_complex.h
@ -935,7 +934,6 @@
/msm_cg.h /msm_cg.h
/neb.cpp /neb.cpp
/neb.h /neb.h
/pair_adp.cpp /pair_adp.cpp
/pair_adp.h /pair_adp.h
/pair_agni.cpp /pair_agni.cpp
@ -996,6 +994,8 @@
/pair_cosine_squared.h /pair_cosine_squared.h
/pair_coul_diel.cpp /pair_coul_diel.cpp
/pair_coul_diel.h /pair_coul_diel.h
/pair_coul_exclude.cpp
/pair_coul_exclude.h
/pair_coul_long.cpp /pair_coul_long.cpp
/pair_coul_long.h /pair_coul_long.h
/pair_coul_msm.cpp /pair_coul_msm.cpp
@ -1433,6 +1433,10 @@
/fix_srp.h /fix_srp.h
/fix_tfmc.cpp /fix_tfmc.cpp
/fix_tfmc.h /fix_tfmc.h
/fix_ttm.cpp
/fix_ttm.h
/fix_ttm_grid.cpp
/fix_ttm_grid.h
/fix_ttm_mod.cpp /fix_ttm_mod.cpp
/fix_ttm_mod.h /fix_ttm_mod.h
/pair_born_coul_long_cs.cpp /pair_born_coul_long_cs.cpp


@ -301,8 +301,7 @@ void PairLineLJ::compute(int eflag, int vflag)
} }
} }
if (evflag) ev_tally(i,j,nlocal,newton_pair, if (evflag) ev_tally(i,j,nlocal,newton_pair,evdwl,0.0,fpair,delx,dely,delz);
evdwl,0.0,fpair,delx,dely,delz);
} }
} }


@ -375,8 +375,7 @@ void PairTriLJ::compute(int eflag, int vflag)
} }
} }
if (evflag) ev_tally(i,j,nlocal,newton_pair, if (evflag) ev_tally(i,j,nlocal,newton_pair,evdwl,0.0,fpair,delx,dely,delz);
evdwl,0.0,fpair,delx,dely,delz);
} }
} }


@ -39,7 +39,6 @@
#include <cstring> #include <cstring>
#include <map> #include <map>
#include <utility> #include <utility>
#include <vector>
using namespace LAMMPS_NS; using namespace LAMMPS_NS;


@ -37,7 +37,6 @@
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
#include <vector>
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
using namespace FixConst; using namespace FixConst;
@ -234,9 +233,7 @@ FixBocs::FixBocs(LAMMPS *lmp, int narg, char **arg) :
iarg += 2; iarg += 2;
} else if (strcmp(arg[iarg],"mtk") == 0) { } else if (strcmp(arg[iarg],"mtk") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal fix bocs command"); if (iarg+2 > narg) error->all(FLERR,"Illegal fix bocs command");
if (strcmp(arg[iarg+1],"yes") == 0) mtk_flag = 1; mtk_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
else if (strcmp(arg[iarg+1],"no") == 0) mtk_flag = 0;
else error->all(FLERR,"Illegal fix bocs command");
iarg += 2; iarg += 2;
} else if (strcmp(arg[iarg],"tloop") == 0) { } else if (strcmp(arg[iarg],"tloop") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal fix bocs command"); if (iarg+2 > narg) error->all(FLERR,"Illegal fix bocs command");


@ -337,8 +337,7 @@ void FixWallBodyPolygon::post_force(int /*vflag*/)
num_contacts = 0; num_contacts = 0;
facc[0] = facc[1] = facc[2] = 0; facc[0] = facc[1] = facc[2] = 0;
vertex_against_wall(i, wall_pos, x, f, torque, side, vertex_against_wall(i, wall_pos, x, f, torque, side, contact_list, num_contacts, facc);
contact_list, num_contacts, facc);
if (num_contacts >= 2) { if (num_contacts >= 2) {


@ -324,8 +324,7 @@ void PairBodyNparticle::compute(int eflag, int vflag)
} }
} }
if (evflag) ev_tally(i,j,nlocal,newton_pair, if (evflag) ev_tally(i,j,nlocal,newton_pair,evdwl,0.0,fpair,delx,dely,delz);
evdwl,0.0,fpair,delx,dely,delz);
} }
} }


@ -207,8 +207,7 @@ void PairBodyRoundedPolygon::compute(int eflag, int vflag)
if (r > radi + radj + cut_inner) continue; if (r > radi + radj + cut_inner) continue;
if (npi == 1 && npj == 1) { if (npi == 1 && npj == 1) {
sphere_against_sphere(i, j, delx, dely, delz, rsq, sphere_against_sphere(i, j, delx, dely, delz, rsq, k_nij, k_naij, x, v, f, evflag);
k_nij, k_naij, x, v, f, evflag);
continue; continue;
} }


@ -20,17 +20,11 @@
#include "fix_brownian.h" #include "fix_brownian.h"
#include "atom.h" #include "atom.h"
#include "comm.h"
#include "domain.h" #include "domain.h"
#include "error.h" #include "error.h"
#include "force.h"
#include "math_extra.h"
#include "memory.h"
#include "random_mars.h" #include "random_mars.h"
#include "update.h"
#include <cmath> #include <cmath>
#include <cstring>
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
using namespace FixConst; using namespace FixConst;


@ -21,17 +21,10 @@
#include "atom.h" #include "atom.h"
#include "atom_vec_ellipsoid.h" #include "atom_vec_ellipsoid.h"
#include "comm.h"
#include "domain.h" #include "domain.h"
#include "error.h" #include "error.h"
#include "force.h"
#include "math_extra.h" #include "math_extra.h"
#include "memory.h"
#include "random_mars.h" #include "random_mars.h"
#include "update.h"
#include <cmath>
#include <cstring>
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
using namespace FixConst; using namespace FixConst;


@ -17,15 +17,12 @@
Contributing author: Sam Cameron (University of Bristol) Contributing author: Sam Cameron (University of Bristol)
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
#include "fix_brownian.h" #include "fix_brownian_base.h"
#include "atom.h"
#include "comm.h" #include "comm.h"
#include "domain.h" #include "domain.h"
#include "error.h" #include "error.h"
#include "force.h" #include "force.h"
#include "math_extra.h"
#include "memory.h"
#include "random_mars.h" #include "random_mars.h"
#include "update.h" #include "update.h"


@ -20,17 +20,12 @@
#include "fix_brownian_sphere.h" #include "fix_brownian_sphere.h"
#include "atom.h" #include "atom.h"
#include "comm.h"
#include "domain.h" #include "domain.h"
#include "error.h" #include "error.h"
#include "force.h"
#include "math_extra.h" #include "math_extra.h"
#include "memory.h"
#include "random_mars.h" #include "random_mars.h"
#include "update.h"
#include <cmath> #include <cmath>
#include <cstring>
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
using namespace FixConst; using namespace FixConst;


@ -23,14 +23,11 @@
#include "atom.h" #include "atom.h"
#include "atom_vec_ellipsoid.h" #include "atom_vec_ellipsoid.h"
#include "comm.h"
#include "domain.h" #include "domain.h"
#include "error.h" #include "error.h"
#include "force.h"
#include "math_extra.h" #include "math_extra.h"
#include "memory.h"
#include "update.h"
#include <cmath>
#include <cstring> #include <cstring>
using namespace LAMMPS_NS; using namespace LAMMPS_NS;


@ -14,7 +14,6 @@
#include "atom_vec_oxdna.h" #include "atom_vec_oxdna.h"
#include "atom.h" #include "atom.h"
#include "comm.h"
#include "error.h" #include "error.h"
#include "force.h" #include "force.h"


@ -19,7 +19,6 @@
#include "atom.h" #include "atom.h"
#include "atom_vec_ellipsoid.h" #include "atom_vec_ellipsoid.h"
#include "atom_vec_oxdna.h"
#include "comm.h" #include "comm.h"
#include "error.h" #include "error.h"
#include "force.h" #include "force.h"
@ -30,7 +29,6 @@
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
#include <utility>
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
using namespace MFOxdna; using namespace MFOxdna;


@ -17,8 +17,6 @@
#include "pair_oxrna2_excv.h" #include "pair_oxrna2_excv.h"
#include <cstring>
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------


@ -19,7 +19,6 @@
#include "atom.h" #include "atom.h"
#include "atom_vec_ellipsoid.h" #include "atom_vec_ellipsoid.h"
#include "atom_vec_oxdna.h"
#include "comm.h" #include "comm.h"
#include "error.h" #include "error.h"
#include "force.h" #include "force.h"
@ -31,7 +30,6 @@
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
#include <utility>
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
using namespace MathConst; using namespace MathConst;


@ -18,13 +18,15 @@
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
#include "pair_lj_sdk_coul_msm.h" #include "pair_lj_sdk_coul_msm.h"
#include <cmath>
#include <cstring>
#include "atom.h" #include "atom.h"
#include "error.h"
#include "force.h" #include "force.h"
#include "kspace.h" #include "kspace.h"
#include "neigh_list.h" #include "neigh_list.h"
#include "error.h"
#include <cmath>
#include <cstring>
#include "lj_sdk_common.h" #include "lj_sdk_common.h"


@ -25,7 +25,6 @@
#include "math_const.h" #include "math_const.h"
#include "memory.h" #include "memory.h"
#include "neighbor.h" #include "neighbor.h"
#include "update.h"
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>


@ -25,7 +25,6 @@
#include "math_const.h" #include "math_const.h"
#include "memory.h" #include "memory.h"
#include "neighbor.h" #include "neighbor.h"
#include "update.h"
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>


@ -397,8 +397,7 @@ void PairLJClass2::compute_outer(int eflag, int vflag)
fpair = factor_lj*forcelj*r2inv; fpair = factor_lj*forcelj*r2inv;
} }
if (evflag) ev_tally(i,j,nlocal,newton_pair, if (evflag) ev_tally(i,j,nlocal,newton_pair,evdwl,0.0,fpair,delx,dely,delz);
evdwl,0.0,fpair,delx,dely,delz);
} }
} }
} }


@ -18,10 +18,11 @@
#include "fix_wall_colloid.h" #include "fix_wall_colloid.h"
#include <cmath>
#include "atom.h" #include "atom.h"
#include "error.h" #include "error.h"
#include <cmath>
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
using namespace FixConst; using namespace FixConst;


@ -10,13 +10,6 @@
#include "colvarproxy_lammps.h" #include "colvarproxy_lammps.h"
#include <mpi.h>
#include <sys/stat.h>
#include <cerrno>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
#include "lammps.h" #include "lammps.h"
#include "error.h" #include "error.h"
@ -26,6 +19,12 @@
#include "colvarmodule.h" #include "colvarmodule.h"
#include "colvarproxy.h" #include "colvarproxy.h"
#include <sys/stat.h>
#include <cerrno>
#include <cstring>
#include <iostream>
#include <memory>
#define HASH_FAIL -1 #define HASH_FAIL -1
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////


@ -12,11 +12,6 @@
#include "colvarproxy_lammps_version.h" // IWYU pragma: export #include "colvarproxy_lammps_version.h" // IWYU pragma: export
#include <cstddef>
#include <mpi.h>
#include <string>
#include <vector>
#include "colvarmodule.h" #include "colvarmodule.h"
#include "colvarproxy.h" #include "colvarproxy.h"
#include "colvartypes.h" #include "colvartypes.h"

Some files were not shown because too many files have changed in this diff Show More