Compare commits

...

211 Commits

SHA1 Message Date
6fd8b2b177 Merge pull request #3122 from akohlmey/maintenance-2021-09-29
Third round of maintenance fixes for the stable release
2022-03-24 14:20:52 -04:00
6edaf42b3d fix temperature initialization bug in KOKKOS nose-hoover code 2022-03-24 11:44:24 -04:00
79c047487d fix parallel execution bug for shell command 2022-03-24 07:38:44 -04:00
ac5acb9abf update threebody example 2022-03-24 07:31:02 -04:00
87fbbd3b13 small kokkos fixes from upstream 2022-03-24 07:18:24 -04:00
8ac0ec6473 Changes needed to compile LAMMPS with latest Kokkos develop 2022-03-24 06:09:03 -04:00
8acba74c4d correct input to load potential file from local folder 2022-03-22 22:32:39 -04:00
34bcbdf41d update extep potential file 2022-03-22 22:31:48 -04:00
d519ca0213 add missing reaxff files to purge list 2022-03-21 14:34:14 -04:00
a392e8dc09 accept infile with 0 lines, so we can create a template from the restart 2022-03-21 00:33:40 -04:00
a4d4f77bc2 run setup_bodies_dynamic() before processing infile in case that is not resetting all data 2022-03-21 00:32:49 -04:00
83a8f72d83 fix off-by-one bug when writing restart files for rigid bodies 2022-03-20 19:14:13 -04:00
3c54b56cfe update overlooked date stamp 2022-03-19 21:00:14 -04:00
ff1a08f148 fixes to CMake build for ML-QUIP package from upstream 2022-03-17 18:07:12 -04:00
5a53b0fc03 import python3 compatibility changes to tools/python from upstream 2022-03-16 13:24:53 -04:00
e550600ebe Error fixed. Epsilon and sigma must also be symmetric 2022-03-16 09:09:52 -04:00
7cb13be52a fix bug where it was not possible to use an absolute path for write_coeff 2022-03-16 09:08:47 -04:00
ab56d7ecd7 augment cmake library search path to include the CUDA stubs library folder
this will help configuring and compiling LAMMPS with CUDA support on
machines where there is no CUDA driver installed
2022-03-10 23:02:57 -05:00
bd6ac3ee6d for 2d systems, rigid bodies always have a moment of inertia and no DOFs need to be subtracted 2022-03-02 16:41:35 -05:00
27ca0a8f41 trigger building an "intel" style neighbor list so that buffers are allocated 2022-02-27 14:50:48 -05:00
f688b9b6b5 use consistent names, avoid memory leaks, fix off-by-1 error in fourier dihedral 2022-02-27 12:25:32 -05:00
16c61b3cc0 add support for plumed 2.6.5, 2.6.6, 2.7.3, 2.7.4, and 2.8.0 (default 2.7.4) 2022-02-25 16:37:00 -05:00
fb480f22fc make cythonize detection compatible with /bin/dash on ubuntu 2022-02-24 21:24:04 -05:00
d0507559a4 when updating ML-IAP due to adding/removing PYTHON we need to delete and re-add cythonize support 2022-02-24 20:40:55 -05:00
58eb331b08 Python 3 compatibility for log commands in tools/python 2022-02-23 10:22:29 -05:00
c68015ca87 Bug fix for Intel package skip lists with multiple runs. 2022-02-18 05:11:34 -05:00
583c22d6e0 update tools/eam_database from upstream 2022-02-16 11:46:11 -05:00
58a4694d92 Remove incorrect error check in ReaxFF 2022-02-11 16:19:00 -05:00
97cf345528 don't allow exceptions to "escape" a destructor 2022-02-10 21:13:26 -05:00
0658abbdd4 silence possible warnings about missing files on "make clean-all" 2022-02-10 21:10:34 -05:00
72026a58bf make certain that "offset" is always initialized 2022-02-10 21:05:12 -05:00
7152231a10 plug memory leak 2022-02-10 20:56:51 -05:00
8fe8a667b6 update create.f with changes from NIST database
also add parameters for Cr and document in README file and change
the code to create output files with .eam.alloy extension
2022-02-10 20:45:16 -05:00
560c543e69 add extra communication of special neighbors when using angle constraints 2022-02-10 20:44:39 -05:00
c5e6650924 import bugfixes for crashes and memory leaks in MSM kspace style from develop 2022-02-10 20:36:35 -05:00
10373ea5c9 avoid failures with "most" presets 2022-02-10 20:11:00 -05:00
992b1cf582 label as update #3 2022-01-25 07:42:00 -05:00
1505f3de06 fix tag caching issue in INTEL package 2022-01-25 07:41:37 -05:00
566efe04f2 always fall back to using the .so extension if available in the LAMMPS module folder 2022-01-19 10:12:50 -05:00
7586adbb6a Merge pull request #3029 from akohlmey/maintenance-2021-09-29
Second round of maintenance fixes for the stable release
2022-01-06 19:58:51 -05:00
69d6ddccc5 create missing de,df table elements from linear extrapolation 2022-01-05 15:34:30 -05:00
5ae496dcef backport array dimension bugfix for NETCDF package in simplified form 2022-01-03 19:55:23 -05:00
bc5d742623 explain that the computed force in python pair is force/r same as in Pair:single() 2022-01-03 10:12:38 -05:00
882e699163 Incorporate bugfixes from issue #3074, a few additional cleanups 2022-01-03 10:11:18 -05:00
9c725d79d6 correct code example for current code 2022-01-01 16:42:28 -05:00
79fbf437a3 correct format string for Error::one() 2021-12-29 16:19:10 -05:00
d130aa4289 address segfault issue with fix nve/gpu when group is not "all" 2021-12-29 14:06:52 -05:00
5d8b83a251 backport GPU package build system updates from upstream 2021-12-27 20:30:43 -05:00
5a2548a83d have internal fix/compute ids include the fix id for fix reaxff/species
this allows using the fix multiple times
also remove code and warning that checks for multiple fix instances

# Conflicts:
#	src/REAXFF/fix_reaxff_species.cpp
2021-12-23 11:36:28 -05:00
a85b310e1f add missing fclose() 2021-12-23 11:28:24 -05:00
e51fd40547 correct names of the pack/unpack routines for forward communication 2021-12-09 18:33:13 -05:00
62f271658b correct setting forward/reverse buffer size info 2021-12-08 13:58:12 -05:00
0aa742934f correct docs for pair style local/density 2021-12-08 00:51:52 -05:00
a26a709a7b correct handling of data packing for forward and reverse communication 2021-12-08 00:51:52 -05:00
027293d285 whitespace 2021-11-24 15:47:05 -05:00
f7d049ac2d generate atom tags for newly created atoms, if tags are enabled. triclinic support. 2021-11-24 15:36:16 -05:00
ea0ff1c8f7 Update CMake utility function get_lammps_version()
With the introduction of LAMMPS_UPDATE, version.h is no longer a single line
file. With this change the CMake utility will only process the LAMMPS_VERSION
line. Fixes issue #3038
2021-11-23 10:44:40 -05:00
5c1bb5f13a Write dump header after sort to fix incorrect atom count for multiproc 2021-11-22 15:52:27 -05:00
24d9b4b611 Update lebedeva potential file and docs based on email on mailing list
https://matsci.org/t/lammps-users-webpage-and-parameter-file-for-the-lebedeva-potential/39059
2021-11-17 08:45:55 -05:00
a0e75c9006 correct unit description of eta_n0 parameters. fixes #3016 2021-11-17 08:38:09 -05:00
2435b953e1 increment update counter 2021-11-17 07:04:44 -05:00
c042e12323 clarifications and corrections for the discussion of the main git branches 2021-11-17 07:04:13 -05:00
e9efe46db9 update branch names 2021-11-17 07:03:56 -05:00
ecc14b7308 update documentation to refer to the new branch names (develop, release) 2021-11-17 07:03:27 -05:00
0152fe5cdf fix segfault when using atom style smd as part of a hybrid style
also remove redundant for clearing
2021-11-16 21:49:56 -05:00
892d17af22 plug memory leaks 2021-11-16 21:49:41 -05:00
2cca00203e Avoid file name collisions in dump unit tests
# Conflicts:
#	unittest/formats/test_dump_atom.cpp
2021-11-16 15:08:27 -05:00
9f4626a62a correct uninitialized data access bug due to shadowing of a base class member 2021-11-16 10:51:46 -05:00
e890a0b45e Merge pull request #2999 from akohlmey/maintenance-2021-09-29
Maintenance fixes for the stable release
2021-11-09 15:11:19 -05:00
68223f0385 mention that dump sorting is limited to less than 2 billion atoms 2021-11-07 08:31:15 -05:00
1291a88bff skip MPI tests if they would be oversubscribing the available processors 2021-11-07 08:30:19 -05:00
d9b687450a account for increased floating point errors when summing numbers to zero 2021-11-07 08:30:04 -05:00
bd950b37d7 change git:// protocol for accessing github to https:// protocol
https://github.blog/2021-09-01-improving-git-protocol-security-github/
2021-11-02 15:30:27 -04:00
21fcdf8c56 Fix bug in Kokkos neighborlist where stencil wasn't updated for occasional list 2021-11-02 13:17:28 -04:00
6b400fb4bf fix indexing bug 2021-10-31 16:19:17 -04:00
d982298ab2 update new LAMMPS paper citation info 2021-10-28 10:09:01 -04:00
765fd7f763 Use correct sizeof in memset 2021-10-27 17:46:37 -04:00
0325047c01 update a few GPU kernels so they can be compiled on GPUs without double precisions support 2021-10-21 07:34:05 -04:00
2dce8923ee more direct version of clearing out loaded plugins 2021-10-19 08:28:19 -04:00
8d1ba074be wipe out all loaded plugins before destroying the LAMMPS instance 2021-10-18 18:06:09 -04:00
4675a3b560 Only check for GPU double precision support if a GPU is present 2021-10-18 13:44:37 -04:00
8999b1f69f add a LAMMPS_UPDATE string define to signal updates to stable releases 2021-10-17 18:06:04 -04:00
6c2b19c11b Add support for an "Update #" appendix to the version string
This is for informative output only, so that any code depending
on the LAMMPS_VERSION define will not have to be changed and no
warnings will be printed etc.
2021-10-17 18:05:29 -04:00
a425334928 port dump vtk to correctly support custom per-atom arrays and fix some bugs 2021-10-17 11:00:33 -04:00
db2faf2789 fix bugs related to custom per-atom properties in dump style custom 2021-10-17 11:00:21 -04:00
fdbb7d0da4 Report only compatible GPU, i.e. no GPU if mixed/double precision is requested but the hardware does not support it 2021-10-15 20:26:47 -04:00
52cd99918f pppm kspace styles also require -DFFT_SINGLE when using GPUs in single precision 2021-10-15 20:24:47 -04:00
a3e6a95ffb allow single precision FFT introspection 2021-10-15 20:24:47 -04:00
5b65169997 correct expansion of fix/compute/variable arguments to avoid bogus thermo output 2021-10-15 20:23:57 -04:00
5f3bf69e30 plug memory leaks 2021-10-15 17:00:46 -04:00
507c02b9af must set define to "see" the lammps_open() library function 2021-10-09 10:21:31 -04:00
b7fe47ba48 Fix bugs and compilation issues in KOKKOS 2021-10-08 09:39:53 -04:00
7dfd11da4b re-freeze Sphinx and other pip installed packages for doc build
The change relative to the stable release fixes a bug with python 3.10 support
2021-10-05 10:52:34 -04:00
97ba95f30e fix a couple more bugs like in 5246cedda6 2021-10-05 10:39:03 -04:00
c1945b4ec9 Fix misplaced MPI calls bug in pair style drip 2021-10-04 07:12:50 -04:00
c4291a4b8e unfreeze versions of python packages used to build the documentation 2021-10-02 23:57:23 -04:00
5b5dfa86c5 also update eigen download for traditional build 2021-10-02 23:56:28 -04:00
3ca3f6959f update eigen3 to the latest release and move download to our own server 2021-10-02 22:55:06 -04:00
f7b7bfa406 Avoid assertions in PythonCapabilities check when using external KOKKOS 2021-10-01 12:05:59 -04:00
3d2f29c92d fix memory allocation bug causing memory corruption on 32-bit arches 2021-10-01 01:16:45 -04:00
c30ba70fab Merge pull request #2957 from akohlmey/next_release_version
Step version strings for stable release
2021-09-29 20:40:00 -04:00
8d6adfa0d1 Merge pull request #2966 from akohlmey/cmake-tweaks
Tweaks to CMake build for portability and early detection of build problems
2021-09-29 19:46:33 -04:00
111e9d9060 Merge pull request #2969 from jrgissing/bond/react-make-Nevery-per-reaction
bond/react: fix nevery keyword bug
2021-09-29 18:42:00 -04:00
7fbd2138bd recover cross-compilation with mingw64 2021-09-29 15:13:55 -04:00
a5ed701908 make Nevery keyword per-reaction 2021-09-29 14:40:22 -04:00
dd4b195552 silence compiler warnings 2021-09-29 14:04:01 -04:00
2651e6ec2f make C library example work with strict C compilers 2021-09-29 10:37:15 -04:00
81d3eb0b2e add missing keyword 2021-09-29 10:29:09 -04:00
3381f72b80 correctly handle Tcl stub library if available 2021-09-29 09:19:47 -04:00
b4307e2354 only need Tcl not Tk to compile Tcl swig wrapper 2021-09-29 09:01:01 -04:00
aa59f7bd91 must have patch command available to compile ScaFaCoS 2021-09-29 07:50:53 -04:00
af7c613200 portability improvement 2021-09-29 07:50:13 -04:00
f7238de5d5 detect and error out if BLAS/LAPACK libraries variables are a list
This will cause external project compilation to fail since the semi-colons
are converted to blanks, but one cannot properly escape the variables.
So far the only viable solution seems to be to convert the scripts from
using ExternalProject_add() to FetchContent and add_subdirectory()
2021-09-29 07:45:07 -04:00
23e173d44f compiling ML-HDNNP with downloaded n2p2 lib requires the sed command 2021-09-29 07:27:49 -04:00
9e49a934c2 Merge pull request #2965 from stanmoore1/neigh_cutoff
Bugfix: prevent neigh list from copying "unique" stencil/bin
2021-09-28 19:28:32 -04:00
8a35ea05bc Prevent neigh list from copying "unique" stencil/bin 2021-09-28 15:33:44 -06:00
ee0d439bbd Merge pull request #2963 from akohlmey/hybrid-one-coeff-bugfix
Make sure the one_coeff flag is applied to hybrid sub-styles
2021-09-28 09:44:10 -04:00
b3c8f85ff9 make sure the one_coeff flag is applied to sub-styles
since the check for Pair::one_coeff was moved to the Input class (to
reduce redundant code), hybrid substyles could "escape" that requirement.
Thus checks have to be added to the hybrid coeff() methods.
2021-09-28 04:39:46 -04:00
c4616d4a11 Merge pull request #2962 from akohlmey/doc-updates
A few final updates to the LAMMPS manual
2021-09-27 20:39:06 -04:00
9d5aa757c3 Merge pull request #2961 from akohlmey/makefile-updates
Add -std=c++11 to a number of machine makefiles for the traditional make build system
2021-09-27 19:42:49 -04:00
34fe792fad freeze versions of pip packages for processing the manual of the stable version
this way we avoid surprises in case one of the packages gets updated
to an incompatible new version. these are known-to-work versions.
2021-09-27 18:31:46 -04:00
d171b92a57 Merge pull request #2959 from Colvars/fix-colvars-run0
Fix Colvars output files not written with "run 0"
2021-09-27 18:08:45 -04:00
53e227766a make "make package-update" and "make package-overwrite" less verbose 2021-09-27 18:01:37 -04:00
09e0214f7d remove references to USER packages, have package lists alphabetically sorted
"make package-update" or "make pu" must be processed in the special order
because of inter-package dependencies
2021-09-27 18:01:01 -04:00
913ce25a01 small tweak 2021-09-27 17:13:32 -04:00
9c4a82f286 be more specific about what the name of the LAMMPS executable can be
also provide a few more examples without a machine suffix
2021-09-27 16:50:25 -04:00
9dbd5bb27d copy request to mention lammps.org form home page instructions for citing 2021-09-27 16:49:29 -04:00
395e22457c add -std=c++11 to a number of machine makefiles for traditional make build 2021-09-27 16:28:55 -04:00
7601001632 Fix Colvars output files not written with "run 0"
See:
  https://github.com/Colvars/colvars/commit/ff2f0d39ee5
which fixes a bug introduced in:
  https://github.com/Colvars/colvars/commit/1e964a542b

The message applies to NAMD, but the logic used in LAMMPS when handling "run 0" is very similar.

The Colvars version string is also updated, however this commit does not
include other changes, such as the following:
  https://github.com/Colvars/colvars/pull/419
which were not fully completed before the LAMMPS Summer 2021 finalization.
2021-09-27 13:38:30 -04:00
7b11f916b7 Merge pull request #2952 from akohlmey/collected-small-changes
Final changes and bugfixes for the stable release
2021-09-26 20:18:34 -04:00
ea030c6dd8 Merge branch 'master' into collected-small-changes 2021-09-26 18:12:40 -04:00
f3b1da83f7 Merge pull request #2956 from stanmoore1/kk_eam_alloy
Fix bug in Kokkos pair_eam_alloy on GPUs
2021-09-26 17:57:03 -04:00
b1d65f001e Merge pull request #2949 from ellio167/kim-print-dirs
Add log file printing of KIM search directories in 'kim init'
2021-09-26 16:34:15 -04:00
b24079fe33 cleaner variant of version check, add directory numbering 2021-09-26 11:24:03 -04:00
18a3728800 Adjust for kim-api bug 2021-09-26 08:36:02 -05:00
184e5fd779 step version strings for stable release 2021-09-25 23:04:53 -04:00
9da8c932ab make check more obvious 2021-09-25 21:33:10 -04:00
0534d98987 update .gitignore for recent additions 2021-09-25 15:54:33 -04:00
9df8a12235 include zstd libs in windows build 2021-09-25 15:18:14 -04:00
64cfd90eeb apply current include file conventions 2021-09-25 13:36:39 -04:00
6f87b1236a cosmetic 2021-09-25 10:42:52 -04:00
53e773e438 calling fwrite() with a null pointer causes undefined behavior. avoid it. 2021-09-25 10:18:55 -04:00
1435a96d6e Fix bug in Kokkos pair_eam_alloy 2021-09-25 07:20:24 -06:00
530912a930 detect double precision support according to OpenCL specs (1.2 and later) 2021-09-25 07:20:52 -04:00
24c9bd4cd2 silence output from hwloc when launching LAMMPS 2021-09-24 23:42:33 -04:00
0b2a4ec4e7 Building voro++ lib as part of LAMMPS requires the "patch" program 2021-09-24 17:07:59 -04:00
85bc9911b8 use proper integer type for atom IDs 2021-09-24 16:57:06 -04:00
b3a8a7bf6f in floating point math a*b may be zero even if both a>0 and b>0 2021-09-24 16:43:07 -04:00
4d9cef823d must use a soft core potential to avoid a singularity 2021-09-24 16:22:44 -04:00
2df1107561 fix string formatting bugs in fix npt/cauchy 2021-09-24 15:52:01 -04:00
973cf017a9 do not call memset on a null pointer 2021-09-24 15:32:59 -04:00
42dca75225 add check and suitable error message when fp64 is required but not available 2021-09-24 12:17:58 -04:00
31f9f17c1b Merge pull request #2917 from akohlmey/programmer-guide-updates
Updates to the Programmer guide section of the manual
2021-09-24 11:27:01 -04:00
46f331095a update some formulations as suggested by @sjplimp 2021-09-23 13:51:06 -04:00
16ab49cff4 update citation info with new LAMMPS paper reference and acknowledge it 2021-09-23 11:59:43 -04:00
5ef4913ebb Merge remote-tracking branch 'github/master' into programmer-guide-updates 2021-09-23 11:16:31 -04:00
17ba0d5804 possible workaround for some GPU package neighbor list issue 2021-09-22 21:47:32 -04:00
7792a4db6b Merge pull request #2932 from rbberger/container_updates
Container definition updates
2021-09-22 17:37:50 -04:00
1b1b6298cd Merge remote-tracking branch 'origin/master' into container_updates 2021-09-22 16:29:42 -04:00
f5fa892ec8 Merge pull request #2916 from rbberger/rocm_updates
Updates to support ROCm 4.3 in GPU package
2021-09-22 16:23:19 -04:00
407f032a55 Update CMake variable descriptions 2021-09-22 15:14:39 -04:00
9906486578 correct paths to downloaded PACE package sources in lib 2021-09-22 12:40:19 -04:00
e79ae552c8 mention how to set the path to the fftw3_omp library 2021-09-22 12:23:20 -04:00
5142300b2e undo "risky" C++20 related changes 2021-09-22 12:22:52 -04:00
d89e6f6765 do not downgrade C++ standard when adding the KOKKOS package 2021-09-21 23:52:49 -04:00
ce05ed15c1 adjust for compatibility with C++20 compilers 2021-09-21 23:52:30 -04:00
f2aacca803 modernize and fix some memory leaks 2021-09-21 22:03:38 -04:00
afccf1933f correctly specify the destructor function name. 2021-09-20 23:40:14 -04:00
8769c0ae98 reformat strings 2021-09-17 22:58:17 -04:00
029fd56c2a Improve style in response to Axel's suggestions 2021-09-17 20:17:45 -05:00
eb3e8e19c6 use clang-format on kim_init.cpp 2021-09-17 20:14:37 -05:00
2709e06d25 Add log file printing of KIM search directories in 'kim init' 2021-09-17 19:43:54 -05:00
02da29513e Merge branch 'master' into programmer-guide-updates
# Conflicts:
#	doc/lammps.1
2021-09-09 23:34:46 -04:00
f9cd6a384b Add Rocky Linux 8 container definition 2021-09-09 10:45:30 -04:00
8da122c6a4 RHEL8/CentOS8 PowerTools is now powertools 2021-09-09 10:31:17 -04:00
5c1fa54750 Update more definition files 2021-09-08 18:11:40 -04:00
d5f70ed347 Update container definitions to include PLUMED 2.7.2 2021-09-08 17:43:12 -04:00
9fb29b165d Update CUDA container definitions to CUDA 11.4 2021-09-08 17:40:48 -04:00
b74a32c1e3 Update Singularity definitions to use ROCm 4.3 2021-09-08 16:07:54 -04:00
1ae15cf8b7 Merge branch 'master' into programmer-guide-updates 2021-09-07 19:01:21 -04:00
3562c76a66 Update compute angle doc page 2021-09-07 19:00:22 -04:00
a16fd25840 minor tweak 2021-09-07 14:26:19 -04:00
31a8940ae8 use larger version of FFT grid comm image 2021-09-06 15:50:02 -04:00
bb8b0ef157 add section on PPPM 2021-09-06 12:23:49 -04:00
c1599ffb3e spelling 2021-09-06 09:52:32 -04:00
d8ba7a3e9a add discussion of OpenMP parallelization 2021-09-06 09:52:19 -04:00
a7696d5f00 add -skiprun to help message 2021-09-05 22:44:37 -04:00
6e17446f38 add section about parallelization in the OPENMP package 2021-09-05 22:42:42 -04:00
6e57f4f08f fix typo 2021-09-05 22:10:00 -04:00
4fc9753a69 break large file into multiple smaller files by section and add toctree 2021-09-05 21:57:03 -04:00
94f03f169f add section about neighbor list construction 2021-09-05 21:22:39 -04:00
d3af77a876 improve the load imbalance viz 2021-09-05 17:56:58 -04:00
b34a3cec1e update man page with missing flags and correct URLs 2021-09-05 12:45:29 -04:00
0c2d8ad210 Merge branch 'master' into programmer-guide-updates 2021-09-05 12:45:15 -04:00
59ef1737c6 add communication section 2021-09-03 22:42:01 -04:00
5be4fb86ea use a more compact image 2021-09-03 21:05:16 -04:00
a98ded7722 adapt section about domain decomposition from paper 2021-09-03 16:59:41 -04:00
6cf2aa4fbb update github workflow doc 2021-09-02 16:29:20 -04:00
0d765a824e integrate file with description of include file conventions 2021-09-02 15:03:19 -04:00
5851692527 mention when testing may be added 2021-09-02 14:25:10 -04:00
d3447008a1 update contribution guidelines for github 2021-09-02 14:24:57 -04:00
ca7bab7e41 refactor style guide and integrate text from issue 2021-09-01 22:16:26 -04:00
72d92ac9e8 correct and clarify Python compatibility 2021-09-01 22:03:12 -04:00
af33724a38 update and reorder the description of the process for submitting contributions 2021-09-01 12:15:52 -04:00
d301c2a71f Merge branch 'master' into programmer-guide-updates 2021-09-01 10:08:51 -04:00
7943cb2067 Merge branch 'master' into programmer-guide-updates 2021-08-31 18:27:25 -04:00
8db2d64f11 Updates to support ROCm 4.3 in GPU package 2021-08-31 17:56:01 -04:00
5257b8d280 split off the programming/submission style guide to a separate file 2021-08-29 22:00:05 -04:00
afc65993d0 clarify 2021-08-29 21:43:13 -04:00
be3348be86 update for clang-format 2021-08-29 21:42:59 -04:00
518b2c24f2 use the term 'website' consistently (and not also 'web site') 2021-08-29 21:42:49 -04:00
280 changed files with 5790 additions and 2694 deletions

View File

@ -25,7 +25,7 @@ function(validate_option name values)
endfunction(validate_option)
function(get_lammps_version version_header variable)
file(READ ${version_header} line)
file(STRINGS ${version_header} line REGEX LAMMPS_VERSION)
set(MONTHS x Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)
string(REGEX REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" "\\1" day "${line}")
string(REGEX REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" "\\2" month "${line}")

View File

@ -1,10 +1,11 @@
find_package(ZLIB REQUIRED)
target_link_libraries(lammps PRIVATE ZLIB::ZLIB)
find_package(PkgConfig REQUIRED)
pkg_check_modules(Zstd IMPORTED_TARGET libzstd>=1.4)
if(Zstd_FOUND)
find_package(PkgConfig QUIET)
if(PkgConfig_FOUND)
pkg_check_modules(Zstd IMPORTED_TARGET libzstd>=1.4)
if(Zstd_FOUND)
target_compile_definitions(lammps PRIVATE -DLAMMPS_ZSTD)
target_link_libraries(lammps PRIVATE PkgConfig::Zstd)
endif()
endif()

View File

@ -30,7 +30,15 @@ file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp)
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
if(GPU_API STREQUAL "CUDA")
find_package(CUDA REQUIRED)
find_package(CUDA QUIET)
# augment search path for CUDA toolkit libraries to include the stub versions. Needed to find libcuda.so on machines without a CUDA driver installation
if(CUDA_FOUND)
set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}")
find_package(CUDA REQUIRED)
else()
message(FATAL_ERROR "CUDA Toolkit not found")
endif()
find_program(BIN2C bin2c)
if(NOT BIN2C)
message(FATAL_ERROR "Could not find bin2c, use -DBIN2C=/path/to/bin2c to help cmake finding it.")
@ -217,13 +225,20 @@ elseif(GPU_API STREQUAL "OPENCL")
elseif(GPU_API STREQUAL "HIP")
if(NOT DEFINED HIP_PATH)
if(NOT DEFINED ENV{HIP_PATH})
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to HIP installation")
else()
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to HIP installation")
endif()
endif()
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
find_package(HIP REQUIRED)
if(NOT DEFINED ROCM_PATH)
if(NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation")
else()
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation")
endif()
endif()
list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH})
find_package(hip REQUIRED)
option(HIP_USE_DEVICE_SORT "Use GPU sorting" ON)
if(NOT DEFINED HIP_PLATFORM)
@ -299,12 +314,12 @@ elseif(GPU_API STREQUAL "HIP")
if(HIP_COMPILER STREQUAL "clang")
add_custom_command(OUTPUT ${CUBIN_FILE}
VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco --offload-arch=${HIP_ARCH} -O3 -ffast-math -DUSE_HIP -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES} -I${LAMMPS_LIB_SOURCE_DIR}/gpu -o ${CUBIN_FILE} ${CU_CPP_FILE}
VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco --offload-arch=${HIP_ARCH} -O3 -DUSE_HIP -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES} -I${LAMMPS_LIB_SOURCE_DIR}/gpu -o ${CUBIN_FILE} ${CU_CPP_FILE}
DEPENDS ${CU_CPP_FILE}
COMMENT "Generating ${CU_NAME}.cubin")
else()
add_custom_command(OUTPUT ${CUBIN_FILE}
VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco -t="${HIP_ARCH}" -f=\"-O3 -ffast-math -DUSE_HIP -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES} -I${LAMMPS_LIB_SOURCE_DIR}/gpu\" -o ${CUBIN_FILE} ${CU_CPP_FILE}
VERBATIM COMMAND ${HIP_HIPCC_EXECUTABLE} --genco -t="${HIP_ARCH}" -f=\"-O3 -DUSE_HIP -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES} -I${LAMMPS_LIB_SOURCE_DIR}/gpu\" -o ${CUBIN_FILE} ${CU_CPP_FILE}
DEPENDS ${CU_CPP_FILE}
COMMENT "Generating ${CU_NAME}.cubin")
endif()
@ -325,10 +340,11 @@ elseif(GPU_API STREQUAL "HIP")
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h ${LAMMPS_LIB_BINARY_DIR}/gpu/*.cu.cpp")
hip_add_library(gpu STATIC ${GPU_LIB_SOURCES})
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_HIP)
target_link_libraries(gpu PRIVATE hip::host)
if(HIP_USE_DEVICE_SORT)
# add hipCUB
@ -377,8 +393,9 @@ elseif(GPU_API STREQUAL "HIP")
endif()
endif()
hip_add_executable(hip_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
add_executable(hip_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(hip_get_devices PRIVATE -DUCL_HIP)
target_link_libraries(hip_get_devices hip::host)
if(HIP_PLATFORM STREQUAL "nvcc")
target_compile_definitions(gpu PRIVATE -D__HIP_PLATFORM_NVCC__)

View File

@ -1,6 +1,8 @@
########################################################################
# As of version 3.3.0 Kokkos requires C++14
set(CMAKE_CXX_STANDARD 14)
if(CMAKE_CXX_STANDARD LESS 14)
set(CMAKE_CXX_STANDARD 14)
endif()
########################################################################
# consistency checks and Kokkos options/settings required by LAMMPS
if(Kokkos_ENABLE_CUDA)

View File

@ -19,6 +19,14 @@ if(DOWNLOAD_LATTE)
set(LATTE_MD5 "820e73a457ced178c08c71389a385de7" CACHE STRING "MD5 checksum of LATTE tarball")
mark_as_advanced(LATTE_URL)
mark_as_advanced(LATTE_MD5)
# CMake cannot pass BLAS or LAPACK library variable to external project if they are a list
list(LENGTH BLAS_LIBRARIES NUM_BLAS)
list(LENGTH LAPACK_LIBRARIES NUM_LAPACK)
if((NUM_BLAS GREATER 1) OR (NUM_LAPACK GREATER 1))
message(FATAL_ERROR "Cannot compile downloaded LATTE library due to a technical limitation")
endif()
include(ExternalProject)
ExternalProject_Add(latte_build
URL ${LATTE_URL}

View File

@ -7,8 +7,9 @@ endif()
option(DOWNLOAD_EIGEN3 "Download Eigen3 instead of using an already installed one)" ${DOWNLOAD_EIGEN3_DEFAULT})
if(DOWNLOAD_EIGEN3)
message(STATUS "Eigen3 download requested - we will build our own")
set(EIGEN3_URL "https://gitlab.com/libeigen/eigen/-/archive/3.3.9/eigen-3.3.9.tar.gz" CACHE STRING "URL for Eigen3 tarball")
set(EIGEN3_MD5 "609286804b0f79be622ccf7f9ff2b660" CACHE STRING "MD5 checksum of Eigen3 tarball")
set(EIGEN3_URL "https://download.lammps.org/thirdparty/eigen-3.4.0.tar.gz" CACHE STRING "URL for Eigen3 tarball")
set(EIGEN3_MD5 "4c527a9171d71a72a9d4186e65bea559" CACHE STRING "MD5 checksum of Eigen3 tarball")
mark_as_advanced(EIGEN3_URL)
mark_as_advanced(EIGEN3_MD5)
include(ExternalProject)

View File

@ -45,12 +45,12 @@ if(DOWNLOAD_N2P2)
# get path to MPI include directory when cross-compiling to windows
if((CMAKE_SYSTEM_NAME STREQUAL Windows) AND CMAKE_CROSSCOMPILING)
get_target_property(N2P2_MPI_INCLUDE MPI::MPI_CXX INTERFACE_INCLUDE_DIRECTORIES)
set(N2P2_PROJECT_OPTIONS "-I ${N2P2_MPI_INCLUDE} -DMPICH_SKIP_MPICXX=1")
set(N2P2_PROJECT_OPTIONS "-I${N2P2_MPI_INCLUDE}")
set(MPI_CXX_COMPILER ${CMAKE_CXX_COMPILER})
endif()
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
get_target_property(N2P2_MPI_INCLUDE MPI::MPI_CXX INTERFACE_INCLUDE_DIRECTORIES)
set(N2P2_PROJECT_OPTIONS "-I ${N2P2_MPI_INCLUDE} -DMPICH_SKIP_MPICXX=1")
set(N2P2_PROJECT_OPTIONS "-I${N2P2_MPI_INCLUDE}")
set(MPI_CXX_COMPILER ${CMAKE_CXX_COMPILER})
endif()
endif()
@ -69,6 +69,12 @@ if(DOWNLOAD_N2P2)
# echo final flag for debugging
message(STATUS "N2P2 BUILD OPTIONS: ${N2P2_BUILD_OPTIONS}")
# must have "sed" command to compile n2p2 library (for now)
find_program(HAVE_SED sed)
if(NOT HAVE_SED)
message(FATAL_ERROR "Must have 'sed' program installed to compile 'n2p2' library for ML-HDNNP package")
endif()
# download and compile n2p2 library. must patch MPI calls in LAMMPS interface to accommodate MPI-2 (e.g. for cross-compiling)
include(ExternalProject)
ExternalProject_Add(n2p2_build

View File

@ -32,7 +32,8 @@ if(DOWNLOAD_QUIP)
foreach(flag ${LAPACK_LIBRARIES})
set(temp "${temp} ${flag}")
endforeach()
set(temp "${temp}\n")
# Fix cmake crashing when MATH_LINKOPTS not set, required for e.g. recent Cray Programming Environment
set(temp "${temp} -L/_DUMMY_PATH_\n")
set(temp "${temp}PYTHON=python\nPIP=pip\nEXTRA_LINKOPTS=\n")
set(temp "${temp}HAVE_CP2K=0\nHAVE_VASP=0\nHAVE_TB=0\nHAVE_PRECON=1\nHAVE_LOTF=0\nHAVE_ONIOM=0\n")
set(temp "${temp}HAVE_LOCAL_E_MIX=0\nHAVE_QC=0\nHAVE_GAP=1\nHAVE_DESCRIPTORS_NONCOMMERCIAL=1\n")
@ -50,7 +51,8 @@ if(DOWNLOAD_QUIP)
GIT_TAG origin/public
GIT_SHALLOW YES
GIT_PROGRESS YES
PATCH_COMMAND cp ${CMAKE_BINARY_DIR}/quip.config <SOURCE_DIR>/arch/Makefile.lammps
GIT_SUBMODULES "src/fox;src/GAP"
PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_BINARY_DIR}/quip.config <SOURCE_DIR>/arch/Makefile.lammps
CONFIGURE_COMMAND env QUIP_ARCH=lammps make config
BUILD_COMMAND env QUIP_ARCH=lammps make libquip
INSTALL_COMMAND ""

View File

@ -12,6 +12,13 @@ if(DOWNLOAD_MSCG)
mark_as_advanced(MSCG_URL)
mark_as_advanced(MSCG_MD5)
# CMake cannot pass BLAS or LAPACK library variable to external project if they are a list
list(LENGTH BLAS_LIBRARIES NUM_BLAS)
list(LENGTH LAPACK_LIBRARIES NUM_LAPACK)
if((NUM_BLAS GREATER 1) OR (NUM_LAPACK GREATER 1))
message(FATAL_ERROR "Cannot compile downloaded MSCG library due to a technical limitation")
endif()
include(ExternalProject)
ExternalProject_Add(mscg_build
URL ${MSCG_URL}

View File

@ -54,8 +54,8 @@ if(DOWNLOAD_PLUMED)
set(PLUMED_BUILD_BYPRODUCTS "<INSTALL_DIR>/lib/libplumedWrapper.a")
endif()
set(PLUMED_URL "https://github.com/plumed/plumed2/releases/download/v2.7.2/plumed-src-2.7.2.tgz" CACHE STRING "URL for PLUMED tarball")
set(PLUMED_MD5 "cfa0b4dd90a81c25d3302e8d97bfeaea" CACHE STRING "MD5 checksum of PLUMED tarball")
set(PLUMED_URL "https://github.com/plumed/plumed2/releases/download/v2.7.4/plumed-src-2.7.4.tgz" CACHE STRING "URL for PLUMED tarball")
set(PLUMED_MD5 "858e0b6aed173748fc85b6bc8a9dcb3e" CACHE STRING "MD5 checksum of PLUMED tarball")
mark_as_advanced(PLUMED_URL)
mark_as_advanced(PLUMED_MD5)

View File

@ -23,6 +23,11 @@ if(DOWNLOAD_SCAFACOS)
file(DOWNLOAD ${LAMMPS_THIRDPARTY_URL}/scafacos-1.0.1-fix.diff ${CMAKE_CURRENT_BINARY_DIR}/scafacos-1.0.1.fix.diff
EXPECTED_HASH MD5=4baa1333bb28fcce102d505e1992d032)
find_program(HAVE_PATCH patch)
if(NOT HAVE_PATCH)
message(FATAL_ERROR "The 'patch' program is required to build the ScaFaCoS library")
endif()
include(ExternalProject)
ExternalProject_Add(scafacos_build
URL ${SCAFACOS_URL}

View File

@ -26,6 +26,11 @@ if(DOWNLOAD_VORO)
set(VORO_BUILD_OPTIONS CXX=${CMAKE_CXX_COMPILER} CFLAGS=${VORO_BUILD_CFLAGS})
endif()
find_program(HAVE_PATCH patch)
if(NOT HAVE_PATCH)
message(FATAL_ERROR "The 'patch' program is required to build the voro++ library")
endif()
ExternalProject_Add(voro_build
URL ${VORO_URL}
URL_MD5 ${VORO_MD5}

View File

@ -0,0 +1,30 @@
# preset that will enable hip (clang/clang++) with support for MPI and OpenMP (on Linux boxes)
# prefer flang over gfortran, if available
find_program(CLANG_FORTRAN NAMES flang gfortran f95)
set(ENV{OMPI_FC} ${CLANG_FORTRAN})
set(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "" FORCE)
set(CMAKE_C_COMPILER "hipcc" CACHE STRING "" FORCE)
set(CMAKE_Fortran_COMPILER ${CLANG_FORTRAN} CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_DEBUG "-Wall -Wextra -g" CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG" CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "" FORCE)
set(CMAKE_Fortran_FLAGS_DEBUG "-Wall -Wextra -g -std=f2003" CACHE STRING "" FORCE)
set(CMAKE_Fortran_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG -std=f2003" CACHE STRING "" FORCE)
set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -DNDEBUG -std=f2003" CACHE STRING "" FORCE)
set(CMAKE_C_FLAGS_DEBUG "-Wall -Wextra -g" CACHE STRING "" FORCE)
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG" CACHE STRING "" FORCE)
set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "" FORCE)
set(MPI_CXX "hipcc" CACHE STRING "" FORCE)
set(MPI_CXX_COMPILER "mpicxx" CACHE STRING "" FORCE)
unset(HAVE_OMP_H_INCLUDE CACHE)
set(OpenMP_C "hipcc" CACHE STRING "" FORCE)
set(OpenMP_C_FLAGS "-fopenmp" CACHE STRING "" FORCE)
set(OpenMP_C_LIB_NAMES "omp" CACHE STRING "" FORCE)
set(OpenMP_CXX "hipcc" CACHE STRING "" FORCE)
set(OpenMP_CXX_FLAGS "-fopenmp" CACHE STRING "" FORCE)
set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE)
set(OpenMP_omp_LIBRARY "libomp.so" CACHE PATH "" FORCE)

View File

@ -48,7 +48,6 @@ set(ALL_PACKAGES
PHONON
PLUGIN
POEMS
PYTHON
QEQ
REACTION
REAXFF

View File

@ -230,7 +230,7 @@ $(VENV):
)
$(MATHJAX):
@git clone -b 3.2.0 -c advice.detachedHead=0 --depth 1 git://github.com/mathjax/MathJax.git $@
@git clone -b 3.2.0 -c advice.detachedHead=0 --depth 1 https://github.com/mathjax/MathJax.git $@
$(ANCHORCHECK): $(VENV)
@( \

View File

@ -33,9 +33,9 @@ when necessary.
## Pull Requests
ALL changes to the LAMMPS code and documentation, however trivial, MUST
be submitted as a pull request to GitHub. All changes to the "master"
be submitted as a pull request to GitHub. All changes to the "develop"
branch must be made exclusively through merging pull requests. The
"unstable" and "stable" branches, respectively are only to be updated
"release" and "stable" branches, respectively are only to be updated
upon patch or stable releases with fast-forward merges based on the
associated tags. Pull requests may also be submitted to (long-running)
feature branches created by LAMMPS developers inside the LAMMPS project,
@ -123,16 +123,16 @@ and thus where this comment should be placed.
LAMMPS uses a continuous release development model with incremental
changes, i.e. significant effort is made - including automated pre-merge
testing - that the code in the branch "master" does not get easily
testing - that the code in the branch "develop" does not get easily
broken. These tests are run after every update to a pull request. More
extensive and time consuming tests (including regression testing) are
performed after code is merged to the "master" branch. There are patch
performed after code is merged to the "develop" branch. There are patch
releases of LAMMPS every 3-5 weeks at a point, when the LAMMPS
developers feel, that a sufficient amount of changes have happened, and
the post-merge testing has been successful. These patch releases are
marked with a `patch_<version date>` tag and the "unstable" branch
marked with a `patch_<version date>` tag and the "release" branch
follows only these versions (and thus is always supposed to be of
production quality, unlike "master", which may be temporary broken, in
production quality, unlike "develop", which may be temporary broken, in
the case of larger change sets or unexpected incompatibilities or side
effects.

View File

@ -1,4 +1,4 @@
.TH LAMMPS "20 September 2021" "2021-09-20"
.TH LAMMPS "29 September 2021" "2021-09-29"
.SH NAME
.B LAMMPS
\- Molecular Dynamics Simulator.

View File

@ -14,7 +14,7 @@ environments with restricted disk space capacity it may be needed to
reduce the storage requirements. Here are some suggestions:
- Create a so-called shallow repository by cloning only the last commit
instead of the full project history by using ``git clone git@github.com:lammps/lammps --depth=1 --branch=master``.
instead of the full project history by using ``git clone git@github.com:lammps/lammps --depth=1 --branch=develop``.
This reduces the downloaded size to about half. With ``--depth=1`` it is not possible to check out different
versions/branches of LAMMPS, using ``--depth=1000`` will make multiple recent versions available at little
extra storage needs (the entire git history had nearly 30,000 commits in fall 2021).

View File

@ -33,12 +33,15 @@ various tools and files. Some of them have to be installed (see below). For
the rest the build process will attempt to download and install them into
a python virtual environment and local folders.
A current version of the manual (latest patch release, aka unstable
branch) is is available online at:
`https://docs.lammps.org/Manual.html <https://docs.lammps.org/Manual.html>`_.
A version of the manual corresponding to the ongoing development (aka master branch)
is available online at: `https://docs.lammps.org/latest/
<https://docs.lammps.org/latest/>`_
A current version of the manual (latest patch release, that is the state
of the *release* branch) is available online at:
`https://docs.lammps.org/ <https://docs.lammps.org/>`_.
A version of the manual corresponding to the ongoing development (that is
the state of the *develop* branch) is available online at:
`https://docs.lammps.org/latest/ <https://docs.lammps.org/latest/>`_
A version of the manual corresponding to the latest stable LAMMPS release
(that is the state of the *stable* branch) is available online at:
`https://docs.lammps.org/stable/ <https://docs.lammps.org/stable/>`_
Build using GNU make
--------------------

View File

@ -71,7 +71,8 @@ LAMMPS can use them if they are available on your system.
-D FFTW3_INCLUDE_DIR=path # path to FFTW3 include files
-D FFTW3_LIBRARY=path # path to FFTW3 libraries
-D FFT_FFTW_THREADS=on # enable using threaded FFTW3 libraries
-D FFTW3_OMP_LIBRARY=path # path to FFTW3 OpenMP wrapper libraries
-D FFT_FFTW_THREADS=on # enable using OpenMP threaded FFTW3 libraries
-D MKL_INCLUDE_DIR=path # ditto for Intel MKL library
-D FFT_MKL_THREADS=on # enable using threaded FFTs with MKL libraries
-D MKL_LIBRARY=path # path to MKL libraries

View File

@ -11,6 +11,7 @@ of time and requests from the LAMMPS user community.
:maxdepth: 1
Developer_org
Developer_parallel
Developer_flow
Developer_write
Developer_notes

View File

@ -0,0 +1,120 @@
Communication
^^^^^^^^^^^^^
Following the partitioning scheme in use, all per-atom data (atom IDs,
positions, velocities, types, etc.) is distributed across the MPI
processes, which allows LAMMPS to handle very large systems provided it
uses a correspondingly large number of MPI processes. To be able to
compute the short-range interactions, MPI processes need not only access
to data of atoms they "own" but also information about atoms from
neighboring sub-domains, in LAMMPS referred to as "ghost" atoms. These
are copies of atoms storing required
per-atom data for up to the communication cutoff distance. The green
dashed-line boxes in the :ref:`domain-decomposition` figure illustrate
the extended ghost-atom sub-domain for one processor.
This approach is also used to implement periodic boundary
conditions: atoms that lie within the cutoff distance across a periodic
boundary are also stored as ghost atoms and taken from the periodic
replication of the sub-domain, which may be the same sub-domain, e.g. if
running in serial. As a consequence of this, force computation in
LAMMPS is not subject to minimum image conventions and thus cutoffs may
be larger than half the simulation domain.
.. _ghost-atom-comm:
.. figure:: img/ghost-comm.png
:align: center
ghost atom communication
This figure shows the ghost atom communication patterns between
sub-domains for "brick" (left) and "tiled" communication styles for
2d simulations. The numbers indicate MPI process ranks. Here the
sub-domains are drawn spatially separated for clarity. The
dashed-line box is the extended sub-domain of processor 0 which
includes its ghost atoms. The red- and blue-shaded boxes are the
regions of communicated ghost atoms.
Efficient communication patterns are needed to update the "ghost" atom
data, since that needs to be done at every MD time step or minimization
step. The diagrams of the :ref:`ghost-atom-comm` figure illustrate how ghost
atom communication is performed in two stages for a 2d simulation (three
in 3d) for both a regular and irregular partitioning of the simulation
box. For the regular case (left) atoms are exchanged first in the
*x*-direction, then in *y*, with four neighbors in the grid of processor
sub-domains.
In the *x* stage, processor ranks 1 and 2 send owned atoms in their
red-shaded regions to rank 0 (and vice versa). Then in the *y* stage,
ranks 3 and 4 send atoms in their blue-shaded regions to rank 0, which
includes ghost atoms they received in the *x* stage. Rank 0 thus
acquires all its ghost atoms; atoms in the solid blue corner regions
are communicated twice before rank 0 receives them.
For the irregular case (right) the two stages are similar, but a
processor can have more than one neighbor in each direction. In the
*x* stage, MPI ranks 1,2,3 send owned atoms in their red-shaded regions to
rank 0 (and vice versa). These include only atoms between the lower
and upper *y*-boundary of rank 0's sub-domain. In the *y* stage, ranks
4,5,6 send atoms in their blue-shaded regions to rank 0. This may
include ghost atoms they received in the *x* stage, but only if they
are needed by rank 0 to fill its extended ghost atom regions in the
+/-*y* directions (blue rectangles). Thus in this case, ranks 5 and
6 do not include ghost atoms they received from each other (in the *x*
stage) in the atoms they send to rank 0. The key point is that while
the pattern of communication is more complex in the irregular
partitioning case, it can still proceed in two stages (three in 3d)
via atom exchanges with only neighboring processors.
When attributes of owned atoms are sent to neighboring processors to
become attributes of their ghost atoms, LAMMPS calls this a "forward"
communication. On timesteps when atoms migrate to new owning processors
and neighbor lists are rebuilt, each processor creates a list of its
owned atoms which are ghost atoms in each of its neighbor processors.
These lists are used to pack per-atom coordinates (for example) into
message buffers in subsequent steps until the next reneighboring.
A "reverse" communication is when computed ghost atom attributes are
sent back to the processor who owns the atom. This is used (for
example) to sum partial forces on ghost atoms to the complete force on
owned atoms. The order of the two stages described in the
:ref:`ghost-atom-comm` figure is inverted and the same lists of atoms
are used to pack and unpack message buffers with per-atom forces. When
a received buffer is unpacked, the ghost forces are summed to owned atom
forces. As in forward communication, forces on atoms in the four blue
corners of the diagrams are sent, received, and summed twice (once at
each stage) before owning processors have the full force.
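To make this pack/unpack bookkeeping concrete, here is a minimal serial sketch (not LAMMPS source; the ``Swap`` struct and function names are illustrative) of how a stored send list is reused both to pack owned-atom coordinates for forward communication and to sum ghost forces back onto owned atoms in reverse communication. The actual MPI exchange between the pack and unpack steps is elided; with a single process the buffer is simply unpacked locally, which is the self-exchange case described in the list below.

.. code-block:: cpp

   // Minimal serial sketch of forward/reverse communication bookkeeping.
   // Illustrative only -- not the LAMMPS implementation.
   #include <array>
   #include <cstdio>
   #include <vector>

   struct Swap {
     std::vector<int> sendlist;  // owned atoms that are ghosts on the partner
     int firstrecv;              // index of the first received ghost atom
     double pbc_shift[3];        // box lengths added when crossing a periodic boundary
   };

   // forward communication: owned coordinates become the partner's ghost coordinates
   void forward_comm(const Swap &s, std::vector<std::array<double,3>> &x) {
     std::vector<double> buf;
     for (int i : s.sendlist)                                // pack
       for (int d = 0; d < 3; ++d) buf.push_back(x[i][d] + s.pbc_shift[d]);
     // an MPI_Send()/MPI_Irecv() pair would exchange "buf" here
     int m = 0;
     for (int n = 0; n < (int) s.sendlist.size(); ++n)       // unpack into ghost slots
       for (int d = 0; d < 3; ++d) x[s.firstrecv + n][d] = buf[m++];
   }

   // reverse communication: ghost forces are summed back onto the owned atoms
   void reverse_comm(const Swap &s, std::vector<std::array<double,3>> &f) {
     std::vector<double> buf;
     for (int n = 0; n < (int) s.sendlist.size(); ++n)       // pack ghost forces
       for (int d = 0; d < 3; ++d) buf.push_back(f[s.firstrecv + n][d]);
     int m = 0;
     for (int i : s.sendlist)                                // unpack and sum
       for (int d = 0; d < 3; ++d) f[i][d] += buf[m++];
   }

   int main() {
     // one owned atom near the edge of a periodic box of length 10,
     // plus one ghost slot for its periodic image
     std::vector<std::array<double,3>> x = {{{9.5, 0.0, 0.0}}, {{0.0, 0.0, 0.0}}};
     std::vector<std::array<double,3>> f = {{{0.0, 0.0, 0.0}}, {{1.0, 0.0, 0.0}}};
     Swap s = {{0}, 1, {-10.0, 0.0, 0.0}};
     forward_comm(s, x);   // ghost image appears at x = -0.5
     reverse_comm(s, f);   // force accumulated on the ghost is added to atom 0
     std::printf("ghost x = %g, owned fx = %g\n", x[1][0], f[0][0]);
   }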
These two operations are used many places within LAMMPS aside from
exchange of coordinates and forces, for example by manybody potentials
to share intermediate per-atom values, or by rigid-body integrators to
enable each atom in a body to access body properties. Here are
additional details about how these communication operations are
performed in LAMMPS:
- When exchanging data with different processors, forward and reverse
communication is done using ``MPI_Send()`` and ``MPI_IRecv()`` calls.
If a processor is "exchanging" atoms with itself, only the pack and
unpack operations are performed, e.g. to create ghost atoms across
periodic boundaries when running on a single processor.
- For forward communication of owned atom coordinates, periodic box
lengths are added and subtracted when the receiving processor is
across a periodic boundary from the sender. There is then no need to
apply a minimum image convention when calculating distances between
atom pairs when building neighbor lists or computing forces.
- The cutoff distance for exchanging ghost atoms is typically equal to
the neighbor cutoff. But it can also be chosen to be longer if needed,
e.g. half the diameter of a rigid body composed of multiple atoms or
over 3x the length of a stretched bond for dihedral interactions. It
can also exceed the periodic box size. For the regular communication
pattern (left), if the cutoff distance extends beyond a neighbor
processor's sub-domain, then multiple exchanges are performed in the
same direction. Each exchange is with the same neighbor processor,
but buffers are packed/unpacked using a different list of atoms. For
forward communication, in the first exchange a processor sends only
owned atoms. In subsequent exchanges, it sends ghost atoms received
in previous exchanges. For the irregular pattern (right) overlaps of
a processor's extended ghost-atom sub-domain with all other processors
in each dimension are detected.

View File

@ -0,0 +1,188 @@
Long-range interactions
^^^^^^^^^^^^^^^^^^^^^^^
For charged systems, LAMMPS can compute long-range Coulombic
interactions via the FFT-based particle-particle/particle-mesh (PPPM)
method implemented in :doc:`kspace style pppm and its variants
<kspace_style>`. For that Coulombic interactions are partitioned into
short- and long-range components. The short-ranged portion is computed
in real space as a loop over pairs of charges within a cutoff distance,
using neighbor lists. The long-range portion is computed in reciprocal
space using a kspace style. For the PPPM implementation the simulation
cell is overlaid with a regular FFT grid in 3d. It proceeds in several stages:
a) each atom's point charge is interpolated to nearby FFT grid points,
b) a forward 3d FFT is performed,
c) a convolution operation is performed in reciprocal space,
d) one or more inverse 3d FFTs are performed, and
e) electric field values from grid points near each atom are interpolated to compute
its forces.
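To illustrate the interpolation in stages (a) and (e), here is a small sketch (hypothetical code, not the LAMMPS implementation, which uses higher-order stencils on a 3d FFT grid) that spreads a single point charge onto the two nearest points of a 1d grid with linear "cloud-in-cell" weights.

.. code-block:: cpp

   // Hypothetical 1d cloud-in-cell charge assignment, illustrating the
   // grid interpolation idea behind PPPM stage (a).
   #include <cmath>
   #include <cstdio>
   #include <vector>

   int main() {
     const int nx = 8;            // number of grid points
     const double L = 8.0;        // periodic box length, so grid spacing h = 1.0
     const double h = L / nx;
     std::vector<double> rho(nx, 0.0);

     const double xq = 3.3, q = 1.0;          // charge q at position xq
     const int i = (int) std::floor(xq / h);  // nearest lower grid point
     const double frac = xq / h - i;          // fractional offset from it
     rho[i % nx]       += q * (1.0 - frac) / h;   // linear weights, periodic wrap
     rho[(i + 1) % nx] += q * frac / h;           // here: 0.7 to rho[3], 0.3 to rho[4]

     for (int k = 0; k < nx; ++k) std::printf("rho[%d] = %.3f\n", k, rho[k]);
   }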
For any of the spatial-decomposition partitioning schemes each processor
owns the brick-shaped portion of FFT grid points contained within its
sub-domain. The two interpolation operations use a stencil of grid
points surrounding each atom. To accommodate the stencil size, each
processor also stores a few layers of ghost grid points surrounding its
brick. Forward and reverse communication of grid point values is
performed similar to the corresponding :doc:`atom data communication
<Developer_par_comm>`. In this case, electric field values on owned
grid points are sent to neighboring processors to become ghost point
values. Likewise charge values on ghost points are sent and summed to
values on owned points.
For triclinic simulation boxes, the FFT grid planes are parallel to
the box faces, but the mapping of charge and electric field values
to/from grid points is done in reduced coordinates where the tilted
box is conceptually a unit cube, so that the stencil and FFT
operations are unchanged. However the FFT grid size required for a
given accuracy is larger for triclinic domains than it is for
orthogonal boxes.
.. _fft-parallel:
.. figure:: img/fft-decomp-parallel.png
:align: center
parallel FFT in PPPM
Stages of a parallel FFT for a simulation domain overlaid
with an 8x8x8 3d FFT grid, partitioned across 64 processors.
Within each of the 4 diagrams, grid cells of the same color are
owned by a single processor; for simplicity only cells owned by 4
or 8 of the 64 processors are colored. The two images on the left
illustrate brick-to-pencil communication. The two images on the
right illustrate pencil-to-pencil communication, which in this
case transposes the *y* and *z* dimensions of the grid.
Parallel 3d FFTs require substantial communication relative to their
computational cost. A 3d FFT is implemented by a series of 1d FFTs
along the *x-*, *y-*, and *z-*\ direction of the FFT grid. Thus the FFT
grid cannot be decomposed like atoms into 3 dimensions for parallel
processing of the FFTs, but only in 1 (as planes) or 2 (as pencils)
dimensions. In between the steps, the grid needs to be transposed so
that the FFT grid portion "owned" by each MPI process is complete in the
direction of the 1d FFTs it has to perform. LAMMPS uses the
pencil-decomposition algorithm as shown in the :ref:`fft-parallel` figure.
Initially (far left), each processor owns a brick of same-color grid
cells (actually grid points) contained within its sub-domain. A
brick-to-pencil communication operation converts this layout to 1d
pencils in the *x*-dimension (center left). Again, cells of the same
color are owned by the same processor. Each processor can then compute
a 1d FFT on each pencil of data it wholly owns using a call to the
configured FFT library. A pencil-to-pencil communication then converts
this layout to pencils in the *y* dimension (center right) which
effectively transposes the *x* and *y* dimensions of the grid, followed
by 1d FFTs in *y*. A final transpose of pencils from *y* to *z* (far
right) followed by 1d FFTs in *z* completes the forward FFT. The data
is left in a *z*-pencil layout for the convolution operation. One or
more inverse FFTs then perform the sequence of 1d FFTs and communication
steps in reverse order; the final layout of resulting grid values is the
same as the initial brick layout.
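The relationship between 1d FFTs and transposes can be seen in the following serial sketch (illustrative only; a naive quadratic DFT stands in for the calls to the configured FFT library). It performs a 2d transform by transforming the contiguous rows, transposing so the other dimension becomes contiguous, and transforming again, leaving the result in a transposed ("pencil") layout just as the parallel 3d version leaves its result in the *z*-pencil layout.

.. code-block:: cpp

   // Serial 2d sketch of "multi-dimensional FFT = 1d FFTs + transposes".
   // Illustrative only; a naive O(n^2) DFT stands in for the FFT library.
   #include <cmath>
   #include <complex>
   #include <cstdio>
   #include <vector>

   using cplx = std::complex<double>;

   std::vector<cplx> dft1d(const std::vector<cplx> &in) {
     const int n = (int) in.size();
     const double pi = std::acos(-1.0);
     std::vector<cplx> out(n);
     for (int k = 0; k < n; ++k)
       for (int j = 0; j < n; ++j)
         out[k] += in[j] * std::polar(1.0, -2.0 * pi * k * j / n);
     return out;
   }

   int main() {
     const int n = 4;
     std::vector<std::vector<cplx>> grid(n, std::vector<cplx>(n, 0.0));
     grid[1][2] = 1.0;                          // a single point value on the grid

     for (auto &row : grid) row = dft1d(row);   // 1d transforms along the contiguous rows

     // transpose so the other direction becomes contiguous ("x-pencil" -> "y-pencil")
     std::vector<std::vector<cplx>> t(n, std::vector<cplx>(n));
     for (int i = 0; i < n; ++i)
       for (int j = 0; j < n; ++j) t[j][i] = grid[i][j];

     for (auto &row : t) row = dft1d(row);      // 1d transforms along the second direction

     // result is left in the transposed (pencil) layout
     std::printf("F(1,1) = (%.3f, %.3f)\n", t[1][1].real(), t[1][1].imag());
   }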
Each communication operation within the FFT (brick-to-pencil or
pencil-to-pencil or pencil-to-brick) converts one tiling of the 3d grid
to another, where a tiling in this context means an assignment of a
small brick-shaped subset of grid points to each processor, the union of
which comprise the entire grid. The parallel `fftMPI library
<https://lammps.github.io/fftmpi/>`_ written for LAMMPS allows arbitrary
definitions of the tiling so that an irregular partitioning of the
simulation domain can use it directly. Transforming data from one
tiling to another is implemented in `fftMPI` using point-to-point
communication, where each processor sends data to a few other
processors, since each tile in the initial tiling overlaps with a
handful of tiles in the final tiling.
The transformations could also be done using collective communication
across all *P* processors with a single call to ``MPI_Alltoall()``, but
this is typically much slower. However, for the specialized brick and
pencil tiling illustrated in :ref:`fft-parallel` figure, collective
communication across the entire MPI communicator is not required. In
the example an :math:`8^3` grid with 512 grid cells is partitioned
across 64 processors; each processor owns a 2x2x2 3d brick of grid
cells. The initial brick-to-pencil communication (upper left to upper
right) only requires collective communication within subgroups of 4
processors, as illustrated by the 4 colors. More generally, a
brick-to-pencil communication can be performed by partitioning *P*
processors into :math:`P^{\frac{2}{3}}` subgroups of
:math:`P^{\frac{1}{3}}` processors each. Each subgroup performs
collective communication only within its subgroup. Similarly,
pencil-to-pencil communication can be performed by partitioning *P*
processors into :math:`P^{\frac{1}{2}}` subgroups of
:math:`P^{\frac{1}{2}}` processors each. This is illustrated in the
figure for the :math:`y \Rightarrow z` communication (center). An
eight-processor subgroup owns the front *yz* plane of data and performs
collective communication within the subgroup to transpose from a
*y*-pencil to *z*-pencil layout.
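Working out those scalings for the *P* = 64, 8x8x8 example in the figure reproduces the subgroup sizes quoted above:

.. math::

   P = 64: \qquad P^{2/3} = 16 \;\text{brick-to-pencil subgroups of}\; P^{1/3} = 4 \;\text{processors}, \qquad
   P^{1/2} = 8 \;\text{pencil-to-pencil subgroups of}\; P^{1/2} = 8 \;\text{processors}.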
LAMMPS invokes point-to-point communication by default, but also
provides the option of partitioned collective communication when using a
:doc:`kspace_modify collective yes <kspace_modify>` command to switch to
that mode. In the latter case, the code detects the size of the
disjoint subgroups and partitions the single *P*-size communicator into
multiple smaller communicators, each of which invokes collective
communication. Testing on a large IBM Blue Gene/Q machine at Argonne
National Labs showed a significant improvement in FFT performance for
large processor counts; partitioned collective communication was faster
than point-to-point communication or global collective communication
involving all *P* processors.
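The following is a minimal sketch of such communicator splitting (illustrative, not the LAMMPS code; the subgroup size of 4 is only an assumption for the example) using the standard MPI API: each subgroup gets its own communicator and collective communication is then restricted to that subgroup.

.. code-block:: cpp

   // Illustrative sketch of partitioned collective communication:
   // split the world communicator into disjoint subgroups and call
   // MPI_Alltoall only within each subgroup.
   // Compile with mpicxx, run with e.g. mpirun -np 8 ./a.out
   #include <mpi.h>
   #include <vector>

   int main(int argc, char **argv) {
     MPI_Init(&argc, &argv);
     int rank, nprocs;
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

     const int group_size = 4;          // assumed subgroup size for this example
     const int color = rank / group_size;   // same color = same subgroup

     MPI_Comm subcomm;
     MPI_Comm_split(MPI_COMM_WORLD, color, rank, &subcomm);

     int subsize;
     MPI_Comm_size(subcomm, &subsize);
     std::vector<double> send(subsize, (double) rank), recv(subsize, 0.0);

     // collective communication restricted to the subgroup
     MPI_Alltoall(send.data(), 1, MPI_DOUBLE, recv.data(), 1, MPI_DOUBLE, subcomm);

     MPI_Comm_free(&subcomm);
     MPI_Finalize();
   }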
Here are some additional details about FFTs for long-range and related
grid/particle operations that LAMMPS supports:
- The fftMPI library allows each grid dimension to be a multiple of
small prime factors (2,3,5), and allows any number of processors to
perform the FFT. The resulting brick and pencil decompositions are
thus not always as well-aligned but the size of subgroups of
processors for the two modes of communication (brick/pencil and
pencil/pencil) still scale as :math:`O(P^{\frac{1}{3}})` and
:math:`O(P^{\frac{1}{2}})`.
- For efficiency in performing 1d FFTs, the grid transpose
operations illustrated in the :ref:`fft-parallel` figure also involve
reordering the 3d data so that a different dimension is contiguous
in memory. This reordering can be done during the packing or
unpacking of buffers for MPI communication.
- For large systems and particularly a large number of MPI processes,
the dominant cost for parallel FFTs is often the communication, not
the computation of 1d FFTs, even though the latter scales as :math:`N
\log(N)` in the number of grid points *N* per grid direction. This is
due to the fact that only a 2d decomposition into pencils is possible
while atom data (and their corresponding short-range force and energy
computations) can be decomposed efficiently in 3d.
This can be addressed by reducing the number of MPI processes involved
in the MPI communication by using :doc:`hybrid MPI + OpenMP
parallelization <Speed_omp>`. This will use OpenMP parallelization
inside the MPI domains and while that may have a lower parallel
efficiency, it reduces the communication overhead.
As an alternative it is also possible to start a :ref:`multi-partition
<partition>` calculation and then use the :doc:`verlet/split
integrator <run_style>` to perform the PPPM computation on a
dedicated, separate partition of MPI processes. This uses an integer
"1:*p*" mapping of *p* sub-domains of the atom decomposition to one
sub-domain of the FFT grid decomposition, where pairwise non-bonded
and bonded forces and energies are computed on the larger partition
and the PPPM kspace computation runs concurrently on the smaller partition.
- LAMMPS also implements PPPM-based solvers for other long-range
interactions, dipole and dispersion (Lennard-Jones), which can be used
in conjunction with long-range Coulombics for point charges.
- LAMMPS implements a ``GridComm`` class which overlays the simulation
domain with a regular grid, partitions it across processors in a
manner consistent with processor sub-domains, and provides methods for
forward and reverse communication of owned and ghost grid point
values. It is used for PPPM as an FFT grid (as outlined above) and
also for the MSM algorithm which uses a cascade of grid sizes from
fine to coarse to compute long-range Coulombic forces. The GridComm
class is also useful for models where continuum fields interact with
particles. For example, the two-temperature model (TTM) defines heat
transfer between atoms (particles) and electrons (continuum gas) where
spatial variations in the electron temperature are computed by finite
differences of a discretized heat equation on a regular grid. The
:doc:`fix ttm/grid <fix_ttm>` command uses the ``GridComm`` class
internally to perform its grid operations on a distributed grid
instead of the original :doc:`fix ttm <fix_ttm>` which uses a
replicated grid.
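The following minimal sketch illustrates the kind of bookkeeping
described in the last bullet above: determining which global grid
points fall inside a processor's sub-domain. It is an illustration
only and not the ``GridComm`` implementation; the grid point
convention (points at cell centers) and all numeric values are
assumptions.

.. code-block:: c++

   #include <cmath>
   #include <cstdio>

   int main()
   {
     // assumed global box, grid size, and sub-domain bounds (x only)
     const double boxlo = 0.0, boxhi = 10.0;
     const int nx = 32;
     const double dx = (boxhi - boxlo) / nx;
     const double sublo = 2.5, subhi = 5.0;

     // grid points are assumed at x = boxlo + (i + 0.5)*dx; a point is
     // "owned" if it lies inside [sublo, subhi)
     int ilo = (int) std::ceil((sublo - boxlo) / dx - 0.5);
     int ihi = (int) std::ceil((subhi - boxlo) / dx - 0.5) - 1;

     printf("owned grid points in x: %d to %d\n", ilo, ihi);
     // ghost grid points would extend this range on both sides and be
     // filled via forward/reverse communication with neighboring
     // processors
     return 0;
   }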

View File

@ -0,0 +1,159 @@
Neighbor lists
^^^^^^^^^^^^^^
To compute forces efficiently, each processor creates a Verlet-style
neighbor list which enumerates all pairs of atoms *i,j* (*i* = owned,
*j* = owned or ghost) with separation less than the applicable
neighbor list cutoff distance. In LAMMPS the neighbor lists are stored
in a multiple-page data structure; each page is a contiguous chunk of
memory which stores vectors of neighbor atoms *j* for many *i* atoms.
This allows pages to be incrementally allocated or deallocated in blocks
as needed. Neighbor lists typically consume the most memory of any data
structure in LAMMPS. The neighbor list is rebuilt (from scratch) once
every few timesteps, then used repeatedly each step for force or other
computations. The neighbor cutoff distance is :math:`R_n = R_f +
\Delta_s`, where :math:`R_f` is the (largest) force cutoff defined by
the interatomic potential for computing short-range pairwise or manybody
forces and :math:`\Delta_s` is a "skin" distance that allows the list to
be used for multiple steps assuming that atoms do not move very far
between consecutive time steps. Typically the code triggers
reneighboring when any atom has moved half the skin distance since the
last reneighboring; this and other options of the neighbor list rebuild
can be adjusted with the :doc:`neigh_modify <neigh_modify>` command.
On steps when reneighboring is performed, atoms which have moved outside
their owning processor's sub-domain are first migrated to new processors
via communication. Periodic boundary conditions are also (only)
enforced on these steps to ensure each atom is re-assigned to the
correct processor. After migration, the atoms owned by each processor
are stored in a contiguous vector. Periodically, each processor
spatially sorts its owned atoms within this vector to improve cache
efficiency in force computations and neighbor list building. To do
this, atoms are binned spatially and then reordered so that atoms in
the same bin are adjacent in the vector. Atom sorting can be disabled
or its settings modified with the :doc:`atom_modify <atom_modify>`
command.
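A minimal sketch of the half-skin rebuild criterion described above (an
illustration, not the LAMMPS implementation): positions saved at the
last neighbor list build are compared against the current positions,
and a rebuild is requested as soon as any owned atom has moved more
than half the skin distance.

.. code-block:: c++

   // xhold holds the positions saved at the last neighbor list build
   bool need_reneighbor(int nlocal, const double (*x)[3],
                        const double (*xhold)[3], double skin)
   {
     const double trigger = 0.5 * skin;
     const double triggersq = trigger * trigger;
     for (int i = 0; i < nlocal; ++i) {
       double dx = x[i][0] - xhold[i][0];
       double dy = x[i][1] - xhold[i][1];
       double dz = x[i][2] - xhold[i][2];
       if (dx*dx + dy*dy + dz*dz > triggersq) return true;
     }
     return false;
   }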
.. _neighbor-stencil:
.. figure:: img/neigh-stencil.png
:align: center
neighbor list stencils
A 2d simulation sub-domain (thick black line) and the corresponding
ghost atom cutoff region (dashed blue line) for both orthogonal
(left) and triclinic (right) domains. A regular grid of neighbor
bins (thin lines) overlays the entire simulation domain and need not
align with sub-domain boundaries; only the portion overlapping the
augmented sub-domain is shown. In the triclinic case it overlaps the
bounding box of the tilted rectangle. The blue- and red-shaded bins
represent a stencil of bins searched to find neighbors of a particular
atom (black dot).
To build a local neighbor list in linear time, the simulation domain is
overlaid (conceptually) with a regular 3d (or 2d) grid of neighbor bins,
as shown in the :ref:`neighbor-stencil` figure for 2d models and a
single MPI processor's sub-domain. Each processor stores a set of
neighbor bins which overlap its sub-domain extended by the neighbor
cutoff distance :math:`R_n`. As illustrated, the bins need not align
with sub-domain boundaries; an integer number of bins in each dimension
is fitted to the size of the entire simulation box.
Most often LAMMPS builds what it calls a "half" neighbor list where
each *i,j* neighbor pair is stored only once, with either atom *i* or
*j* as the central atom. The build can be done efficiently by using a
pre-computed "stencil" of bins around a central origin bin which
contains the atom whose neighbors are being searched for. A stencil
is simply a list of integer offsets in *x,y,z* of nearby bins
surrounding the origin bin which are close enough to contain any
neighbor atom *j* within a distance :math:`R_n` from any atom *i* in the
origin bin. Note that for a half neighbor list, the stencil can be
asymmetric, since each atom only needs to store half of its nearby
neighbors. These stencils are illustrated in the figure for a half
list and a bin size of :math:`\frac{1}{2} R_n`. There are 13 red+blue
stencil bins in 2d (for the orthogonal case, 15 for triclinic). In 3d
there would be 63: 13 in the plane of bins that contains the origin bin
and 25 in each of the two planes above it in the *z* direction (75 for
triclinic). The reason the triclinic stencil has extra bins is that
the bins tile the bounding box of the entire triclinic domain and thus
are not periodic with respect to the simulation box itself. The
stencil and the logic for determining which *i,j* pairs to include in
the neighbor list are altered slightly to account for this.
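As a small worked check of the stencil-bin counts quoted above
(orthogonal case, bin size :math:`\frac{1}{2} R_n`, so the stencil
extends two bins in each direction from the origin bin):

.. math::

   N_{2d} = \underbrace{1}_{\text{origin bin}}
          + \underbrace{2}_{\Delta y = 0,\ \Delta x > 0}
          + \underbrace{2 \times 5}_{\Delta y = 1,2} = 13, \qquad
   N_{3d} = \underbrace{13}_{\Delta z = 0}
          + \underbrace{2 \times 25}_{\Delta z = 1,2} = 63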
To build a neighbor list, a processor first loops over its "owned" plus
"ghost" atoms and assigns each to a neighbor bin. This uses an integer
vector to create a linked list of atom indices within each bin. It then
performs a triply-nested loop over its owned atoms *i*, the stencil of
bins surrounding atom *i*'s bin, and the *j* atoms in each stencil bin
(including ghost atoms). If the distance :math:`r_{ij} < R_n`, then
atom *j* is added to the vector of atom *i*'s neighbors.
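The following is a minimal sketch of this binned build loop (an
illustration only, not the LAMMPS implementation, which uses pages and
asymmetric stencils). It assumes a full, symmetric stencil of
flattened bin-index offsets and de-duplicates pairs with a simple index
comparison; the per-bin linked list is given by ``binhead`` (first atom
in a bin) and ``next`` (next atom in the same bin, or -1). The bins
are assumed to cover the sub-domain extended by the cutoff, so stencil
offsets from bins of owned atoms stay in range.

.. code-block:: c++

   #include <vector>

   void build_half_list(int nlocal, const double (*x)[3],
                        const std::vector<int> &binof,    // bin index of each atom
                        const std::vector<int> &binhead,  // first atom in each bin (or -1)
                        const std::vector<int> &next,     // next atom in same bin (or -1)
                        const std::vector<int> &stencil,  // flattened bin offsets to search
                        double cutneighsq,
                        std::vector<std::vector<int>> &neigh)
   {
     neigh.assign(nlocal, {});
     for (int i = 0; i < nlocal; ++i) {
       int ibin = binof[i];
       for (int s : stencil) {                    // loop over stencil bins
         for (int j = binhead[ibin + s]; j >= 0; j = next[j]) {
           if (j <= i) continue;                  // half list: store each pair only once
           double dx = x[i][0] - x[j][0];
           double dy = x[i][1] - x[j][1];
           double dz = x[i][2] - x[j][2];
           if (dx*dx + dy*dy + dz*dz < cutneighsq) neigh[i].push_back(j);
         }
       }
     }
   }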
Here are additional details about neighbor list build options LAMMPS
supports:
- The choice of bin size is an option; a size half of :math:`R_n` has
been found to be optimal for many typical cases. Smaller bins incur
additional overhead to loop over; larger bins require more distance
calculations. Note that for smaller bin sizes, the 2d stencil in the
figure would be more semi-circular in shape (hemispherical in 3d),
with bins near the corners of the square eliminated due to their
distance from the origin bin.
- Depending on the interatomic potential(s) and other commands used in
an input script, multiple neighbor lists and stencils with different
attributes may be needed. This includes lists with different cutoff
distances, e.g. for force computation versus occasional diagnostic
computations such as a radial distribution function, or for the
r-RESPA time integrator which can partition pairwise forces by
distance into subsets computed at different time intervals. It
includes "full" lists (as opposed to half lists) where each *i,j* pair
  appears twice (once with atom *i* and once with atom *j*), and which use a larger
symmetric stencil. It also includes lists with partial enumeration of
ghost atom neighbors. The full and ghost-atom lists are used by
  various manybody interatomic potentials. Lists may also use different
  criteria for including a pair interaction. Typically the criterion
  depends only on the distance between the two atoms and the cutoff
  distance, but for finite-size coarse-grained particles with
  individual diameters (e.g. polydisperse granular particles) it can
  also depend on the diameters of the two particles.
- When using :doc:`pair style hybrid <pair_hybrid>` multiple sub-lists
of the master neighbor list for the full system need to be generated,
one for each sub-style, which contains only the *i,j* pairs needed to
compute interactions between subsets of atoms for the corresponding
potential. This means not all *i* or *j* atoms owned by a processor
are included in a particular sub-list.
- Some models use different cutoff lengths for pairwise interactions
between different kinds of particles which are stored in a single
neighbor list. One example is a solvated colloidal system with large
colloidal particles where colloid/colloid, colloid/solvent, and
solvent/solvent interaction cutoffs can be dramatically different.
Another is a model of polydisperse finite-size granular particles;
pairs of particles interact only when they are in contact with each
other. Mixtures with particle size ratios as high as 10-100x may be
used to model realistic systems. Efficient neighbor list building
algorithms for these kinds of systems are available in LAMMPS. They
include a method which uses different stencils for different cutoff
lengths and trims the stencil to only include bins that straddle the
cutoff sphere surface. More recently a method which uses both
multiple stencils and multiple bin sizes was developed; it builds
neighbor lists efficiently for systems with particles of any size
ratio, though other considerations (timestep size, force computations)
may limit the ability to model systems with huge polydispersity.
- For small and sparse systems and as a fallback method, LAMMPS also
supports neighbor list construction without binning by using a full
:math:`O(N^2)` loop over all *i,j* atom pairs in a sub-domain when
using the :doc:`neighbor nsq <neighbor>` command.
- Depending on the "pair" setting of the :doc:`newton <newton>`
  command, a "half" neighbor list may contain **all** pairs of atoms
  where atom *j* is a ghost atom (i.e. when the newton pair setting is
  *off*). With the newton pair *on* setting, a ghost atom *j* is added
  to the list only if its *z* coordinate is larger than that of atom
  *i*, or, if equal, its *y* coordinate is larger, or, if that is also
  equal, its *x* coordinate is larger (as sketched below). For
  homogeneously dense systems this selects neighbors from a same-size
  sector in the same direction relative to the "owned" atom, which
  leads to similarly sized neighbor lists and thus reduces the chance
  of a load imbalance.
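A minimal sketch of the newton-on tie-break rule mentioned in the last
bullet (illustration only): a ghost atom *j* is stored as a neighbor of
owned atom *i* only if it lies in the "upper" half-space defined by
comparing *z*, then *y*, then *x*.

.. code-block:: c++

   // returns true if ghost atom j should be stored as a neighbor of
   // owned atom i under the newton pair "on" setting
   bool store_ghost_pair(const double *xi, const double *xj)
   {
     if (xj[2] > xi[2]) return true;    // larger z: keep
     if (xj[2] < xi[2]) return false;
     if (xj[1] > xi[1]) return true;    // equal z: compare y
     if (xj[1] < xi[1]) return false;
     return xj[0] > xi[0];              // equal z and y: compare x
   }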

View File

@ -0,0 +1,114 @@
OpenMP Parallelism
^^^^^^^^^^^^^^^^^^
The styles in the INTEL, KOKKOS, and OPENMP packages can use OpenMP
thread parallelism, predominantly to distribute loops over local data,
and thus follow a parallelization strategy that is orthogonal to the
decomposition into spatial domains used by the :doc:`MPI partitioning
<Developer_par_part>`. For clarity, this section discusses only the
implementation in the OPENMP package, as it is the simplest. The INTEL
and KOKKOS packages offer additional options and are more complex,
since they support more features and different hardware such as
co-processors or GPUs.
One of the key decisions when implementing the OPENMP package was to
keep the changes to the source code small, so that it would be easier
to maintain the code and keep it in sync with the non-threaded standard
implementation. This is achieved by a) making the OPENMP version a
class derived from the regular version (e.g. ``PairLJCutOMP`` from
``PairLJCut``) and overriding only methods that are multi-threaded or
need to be modified to support multi-threading (similar to what was
done in the OPT package), b) keeping the structure of the modified code
very similar, so that side-by-side comparisons remain useful, and c)
moving additional functionality and multi-thread support functions into
three separate classes: ``ThrOMP``, ``ThrData``, and ``FixOMP``.
``ThrOMP`` provides additional, multi-thread aware functionality not
available in the corresponding base class (e.g. ``Pair`` for
``PairLJCutOMP``), such as multi-thread aware variants of the "tally"
functions. Those functions are made available through multiple
inheritance, so the new functions must have unique names to avoid
ambiguities; typically ``_thr`` is appended to the name of the
function. ``ThrData`` is a class that manages per-thread data
structures. It is used instead of extending the corresponding storage
to per-thread arrays in order to avoid slowdowns due to "false
sharing", which occurs when multiple threads update adjacent elements
of an array and thus force CPU cache lines to be invalidated and
re-fetched. Finally, ``FixOMP`` manages the "multi-thread state", i.e.
settings and access to per-thread storage; it is activated by the
:doc:`package omp <package>` command.
Avoiding data races
"""""""""""""""""""
A key problem when implementing thread parallelism in an MD code is
avoiding data races when updating accumulated properties like forces,
energies, and stresses. Computed interactions always involve multiple
atoms, so race conditions arise when multiple threads update per-atom
data of the same atoms. Five possible strategies were considered to
avoid this:
1) restructure the code so that no overlapping access is possible when
   computing in parallel, e.g. by breaking lists into multiple parts
   and synchronizing threads in between;
2) make each thread "responsible" for a specific group of atoms,
   compute each interaction redundantly on every thread responsible
   for one of the atoms involved, and have each thread update only the
   properties of its own atoms;
3) use mutexes around functions and regions of code where a data race
   could happen;
4) use atomic operations when updating per-atom properties;
5) use replicated per-thread data structures to accumulate data
   without conflicts and then use a reduction to combine those results
   into the data structures used by the regular style.
Option 5 was chosen for the OPENMP package because it retains the
performance of the single-threaded case and keeps the code more
maintainable. Option 1 would require extensive code changes,
particularly to the neighbor list code; option 2 would have incurred a
2x or larger performance penalty for the serial case; option 3 causes
significant overhead and would enforce serialization of operations in
inner loops and thus defeat the purpose of multi-threading; option 4
slows down the serial case, although not as badly as option 2. The
downside of option 5 is that the overhead of the reduction operations
grows with the number of threads used, so there is a crossover point
beyond which options 2 or 4 would result in faster execution. That is
why option 2, for example, is used in the GPU package, since a GPU is a
processor with a massive number of threads. However, since the MPI
parallelization is generally more effective for typical MD systems, the
expectation is that thread parallelism will only be used with a smaller
number of threads (2-8). At the time of its implementation, that
number was equivalent to the number of CPU cores per CPU socket on
high-end supercomputers.
Thus arrays like the force array are dimensioned to the number of
atoms times the number of threads when OpenMP support is enabled, and
inside the compute functions each thread obtains a pointer to its own
chunk. Similarly, accumulators like the potential energy or the virial
are kept in per-thread instances of the ``ThrData`` class and are only
reduced and stored in their global counterparts at the end of the force
computation.
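The following is a minimal sketch of this pattern (per-thread force
chunks plus a final reduction), using only the *x* force component for
brevity. It is an illustration of the idea only; in LAMMPS this is
handled through the ``ThrData``/``ThrOMP``/``FixOMP`` classes.

.. code-block:: c++

   #include <omp.h>
   #include <vector>

   void accumulate_forces(int nall, std::vector<double> &fx)   // fx has size nall
   {
     int nthreads = omp_get_max_threads();
     std::vector<double> fx_thr((size_t) nthreads * nall, 0.0);

     #pragma omp parallel
     {
       int tid = omp_get_thread_num();
       double *myf = &fx_thr[(size_t) tid * nall];   // this thread's private chunk

       // ... each thread computes its share of the pair interactions
       //     and adds the resulting forces into myf[] only ...

       // reduction: sum the per-thread chunks into the global array;
       // the loop over atoms is again split statically across threads
       #pragma omp barrier
       #pragma omp for schedule(static)
       for (int i = 0; i < nall; ++i) {
         double sum = 0.0;
         for (int t = 0; t < nthreads; ++t)
           sum += fx_thr[(size_t) t * nall + i];
         fx[i] += sum;
       }
     }
   }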
Loop scheduling
"""""""""""""""
Multi-thread parallelization is applied by distributing (outer) loops
statically across threads, typically the loop over local atoms *i* when
processing *i,j* pairs of atoms from a neighbor list. The design of
the neighbor list code results in atoms having a similar number of
neighbors for homogeneous systems, so load imbalance across threads is
uncommon; it typically occurs for systems where the MPI parallelization
would also be unbalanced, which usually has the more pronounced impact
on performance. The same loop scheduling scheme is also applied to the
reduction operations on per-atom data to reduce the overhead of the
reduction.
Neighbor list parallelization
"""""""""""""""""""""""""""""
In addition to the parallelization of the force computations, the
generation of the neighbor lists is also parallelized. As explained
previously, neighbor lists are built by looping over "owned" atoms and
storing their neighbors in "pages". In the OPENMP variants of the
neighbor list code, each thread operates on a different chunk of the
"owned" atoms and allocates and fills its own set of pages with
neighbor list data. This is achieved by each thread keeping its own
instance of the :cpp:class:`MyPage <LAMMPS_NS::MyPage>` page allocator
class, as sketched below.
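A minimal sketch of this chunking scheme (illustration only): each
thread processes a contiguous range of owned atoms and writes neighbor
data only into storage it owns, analogous to each thread filling its
own ``MyPage`` instance in LAMMPS.

.. code-block:: c++

   #include <omp.h>
   #include <algorithm>
   #include <vector>

   void build_list_threaded(int nlocal,
                            std::vector<std::vector<int>> &neigh)  // size nlocal
   {
     #pragma omp parallel
     {
       int tid = omp_get_thread_num();
       int nthreads = omp_get_num_threads();

       // static chunking of owned atoms across threads
       int chunk = (nlocal + nthreads - 1) / nthreads;
       int ifrom = tid * chunk;
       int ito = std::min(nlocal, ifrom + chunk);

       for (int i = ifrom; i < ito; ++i) {
         neigh[i].clear();
         // ... bin/stencil search as in the serial build; neigh[i] is
         //     written only by this thread, so no locking is required ...
       }
     }
   }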

View File

@ -0,0 +1,89 @@
Partitioning
^^^^^^^^^^^^
The underlying spatial decomposition strategy used by LAMMPS for
distributed-memory parallelism is set with the :doc:`comm_style command
<comm_style>` and can be either "brick" (a regular grid) or "tiled".
.. _domain-decomposition:
.. figure:: img/domain-decomp.png
:align: center
domain decomposition
This figure shows the different kinds of domain decomposition used
for MPI parallelization: "brick" on the left with an orthogonal
(left) and a triclinic (middle) simulation domain, and a "tiled"
decomposition (right). The black lines show the division into
sub-domains and the contained atoms are "owned" by the corresponding
MPI process. The green dashed lines indicate how sub-domains are
extended with "ghost" atoms up to the communication cutoff distance.
The LAMMPS simulation box is a 3d or 2d volume, which can be orthogonal
or triclinic in shape, as illustrated in the :ref:`domain-decomposition`
figure for the 2d case. Orthogonal means the box edges are aligned with
the *x*, *y*, *z* Cartesian axes, and the box faces are thus all
rectangular. Triclinic allows for a more general parallelepiped shape
in which edges are aligned with three arbitrary vectors and the box
faces are parallelograms. In each dimension box faces can be periodic,
or non-periodic with fixed or shrink-wrapped boundaries. In the fixed
case, atoms which move outside the face are deleted; shrink-wrapped
means the position of the box face adjusts continuously to enclose all
the atoms.
For distributed-memory MPI parallelism, the simulation box is spatially
decomposed (partitioned) into non-overlapping sub-domains which fill the
box. The default partitioning, "brick", is most suitable when atom
density is roughly uniform, as shown in the left-side images of the
:ref:`domain-decomposition` figure. The sub-domains comprise a regular
grid and all sub-domains are identical in size and shape. Both the
orthogonal and triclinic boxes can deform continuously during a
simulation, e.g. to compress a solid or shear a liquid, in which case
the processor sub-domains likewise deform.
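For the regular "brick" decomposition, determining which processor owns
an atom reduces to simple arithmetic on the atom's fractional position
inside the box. The following is a minimal sketch for an orthogonal
box (illustration only; the rank ordering of the processor grid is an
assumption, not necessarily the one LAMMPS uses):

.. code-block:: c++

   // map an atom position to a rank on a regular px x py x pz processor grid
   int owning_rank(const double *x, const double *boxlo, const double *boxhi,
                   int px, int py, int pz)
   {
     const int pdim[3] = {px, py, pz};
     int ip[3];
     for (int d = 0; d < 3; ++d) {
       double frac = (x[d] - boxlo[d]) / (boxhi[d] - boxlo[d]);  // in [0,1)
       ip[d] = (int) (frac * pdim[d]);
       if (ip[d] == pdim[d]) ip[d] = pdim[d] - 1;                // guard the upper edge
     }
     return (ip[2] * py + ip[1]) * px + ip[0];    // assumed row-major rank ordering
   }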
For models with non-uniform density, the number of particles per
processor can be load-imbalanced with the default partitioning. This
reduces parallel efficiency, as the overall simulation rate is limited
by the slowest processor, i.e. the one with the largest computational
load. For such models, LAMMPS supports multiple strategies to reduce
the load imbalance:
- The processor grid decomposition is by default based on the size and
  shape of the simulation cell and tries to optimize the
  volume-to-surface ratio of the sub-domains. This can be changed with
  the :doc:`processors command <processors>`.
- The parallel planes defining the extent of the sub-domains can be
  shifted with the :doc:`balance command <balance>`; this can be done
  in addition to choosing a more optimal processor grid.
- The recursive bisectioning algorithm in combination with the "tiled"
communication style can produce a partitioning with equal numbers of
particles in each sub-domain.
.. |decomp1| image:: img/decomp-regular.png
:width: 24%
.. |decomp2| image:: img/decomp-processors.png
:width: 24%
.. |decomp3| image:: img/decomp-balance.png
:width: 24%
.. |decomp4| image:: img/decomp-rcb.png
:width: 24%
|decomp1| |decomp2| |decomp3| |decomp4|
The pictures above demonstrate different decompositions for a 2d
system with 12 MPI ranks. The atom colors indicate the load imbalance
of each sub-domain, with green being optimal and red least optimal.
Due to the vacuum in the system, the default decomposition is
unbalanced, with several MPI ranks owning no atoms (left). By forcing
a 1x12x1 processor grid, every MPI rank now does computations, but the
number of atoms per sub-domain is still uneven and the thin slice shape
increases the amount of communication between sub-domains (center
left). With a 2x6x1 processor grid and shifted sub-domain divisions,
the load imbalance is further reduced and less communication between
sub-domains is required (center right). Using recursive bisectioning
leads to a further improved decomposition (right).

View File

@ -0,0 +1,28 @@
Parallel algorithms
-------------------
LAMMPS is designed to run simulations in parallel using the MPI
parallel communication standard with data distributed via domain
decomposition. The parallelization aims to be efficient and to provide
good strong scaling (i.e. good speedup for a system of fixed size) and
good weak scaling (i.e. the computational cost of enlarging the system
grows proportionally to the system size). Additional parallelization
using GPUs or OpenMP can be applied within the sub-domain assigned to
an MPI process. For clarity, most of the following illustrations show
the 2d simulation case; the underlying algorithms, however, apply
equally well to 2d and 3d.
.. note::
The text and most of the figures in this chapter were adapted
for the manual from the section on parallel algorithms in the
:ref:`new LAMMPS paper <lammps_paper>`.
.. toctree::
:maxdepth: 1
Developer_par_part
Developer_par_comm
Developer_par_neigh
Developer_par_long
Developer_par_openmp

View File

@ -53,7 +53,7 @@ of each timestep. First of all, implement a constructor:
if (narg < 4)
error->all(FLERR,"Illegal fix print/vel command");
nevery = force->inumeric(FLERR,arg[3]);
nevery = utils::inumeric(FLERR,arg[3],false,lmp);
if (nevery <= 0)
error->all(FLERR,"Illegal fix print/vel command");
}

View File

@ -7,11 +7,11 @@ LAMMPS GitHub tutorial
This document describes the process of how to use GitHub to integrate
changes or additions you have made to LAMMPS into the official LAMMPS
distribution. It uses the process of updating this very tutorial as
an example to describe the individual steps and options. You need to
be familiar with git and you may want to have a look at the
`git book <http://git-scm.com/book/>`_ to reacquaint yourself with some
of the more advanced git features used below.
distribution. It uses the process of updating this very tutorial as an
example to describe the individual steps and options. You need to be
familiar with git and you may want to have a look at the `git book
<http://git-scm.com/book/>`_ to familiarize yourself with some of the
more advanced git features used below.
As of fall 2016, submitting contributions to LAMMPS via pull requests
on GitHub is the preferred option for integrating contributed features
@ -37,15 +37,15 @@ username or e-mail address and password.
**Forking the repository**
To get changes into LAMMPS, you need to first fork the `lammps/lammps`
repository on GitHub. At the time of writing, *master* is the preferred
repository on GitHub. At the time of writing, *develop* is the preferred
target branch. Thus go to `LAMMPS on GitHub <https://github.com/lammps/lammps>`_
and make sure branch is set to "master", as shown in the figure below.
and make sure branch is set to "develop", as shown in the figure below.
.. image:: JPG/tutorial_branch.png
:align: center
If it is not, use the button to change it to *master*\ . Once it is, use the
fork button to create a fork.
If it is not, use the button to change it to *develop*. Once it is, use
the fork button to create a fork.
.. image:: JPG/tutorial_fork.png
:align: center
@ -64,11 +64,12 @@ LAMMPS development.
**Adding changes to your own fork**
Additions to the upstream version of LAMMPS are handled using *feature
branches*\ . For every new feature, a so-called feature branch is
branches*. For every new feature, a so-called feature branch is
created, which contains only those modification relevant to one specific
feature. For example, adding a single fix would consist of creating a
branch with only the fix header and source file and nothing else. It is
explained in more detail here: `feature branch workflow <https://www.atlassian.com/git/tutorials/comparing-workflows/feature-branch-workflow>`_.
explained in more detail here: `feature branch workflow
<https://www.atlassian.com/git/tutorials/comparing-workflows/feature-branch-workflow>`_.
**Feature branches**
@ -94,8 +95,8 @@ The above command copies ("clones") the git repository to your local
machine to a directory with the name you chose. If none is given, it will
default to "lammps". Typical names are "mylammps" or something similar.
You can use this local clone to make changes and
test them without interfering with the repository on GitHub.
You can use this local clone to make changes and test them without
interfering with the repository on GitHub.
To pull changes from upstream into this copy, you can go to the directory
and use git pull:
@ -103,28 +104,46 @@ and use git pull:
.. code-block:: bash
$ cd mylammps
$ git checkout master
$ git pull https://github.com/lammps/lammps
$ git checkout develop
$ git pull https://github.com/lammps/lammps develop
You can also add this URL as a remote:
.. code-block:: bash
$ git remote add lammps_upstream https://www.github.com/lammps/lammps
$ git remote add upstream https://www.github.com/lammps/lammps
At this point, you typically make a feature branch from the updated master
From then on you can update your upstream branches with:
.. code-block:: bash
$ git fetch upstream
and then refer to the upstream repository branches with
`upstream/develop` or `upstream/release` and so on.
At this point, you typically make a feature branch from the updated
branch for the feature you want to work on. This tutorial contains the
workflow that updated this tutorial, and hence we will call the branch
"github-tutorial-update":
.. code-block:: bash
$ git checkout -b github-tutorial-update master
$ git fetch upstream
$ git checkout -b github-tutorial-update upstream/develop
Now that we have changed branches, we can make our changes to our local
repository. Just remember that if you want to start working on another,
unrelated feature, you should switch branches!
.. note::
Committing changes to the *develop*, *release*, or *stable* branches
is strongly discouraged. While it may be convenient initially, it
will create more work in the long run. Various texts and tutorials
on using git effectively discuss the motivation for using feature
branches instead.
**After changes are made**
After everything is done, add the files to the branch and commit them:
@ -287,28 +306,32 @@ After each push, the automated checks are run again.
LAMMPS developers may add labels to your pull request to assign it to
categories (mostly for bookkeeping purposes), but a few of them are
important: needs_work, work_in_progress, test-for-regression, and
full-regression-test. The first two indicate, that your pull request
is not considered to be complete. With "needs_work" the burden is on
exclusively on you; while "work_in_progress" can also mean, that a
LAMMPS developer may want to add changes. Please watch the comments
to the pull requests. The two "test" labels are used to trigger
extended tests before the code is merged. This is sometimes done by
LAMMPS developers, if they suspect that there may be some subtle
side effects from your changes. It is not done by default, because
those tests are very time consuming.
important: *needs_work*, *work_in_progress*, *run_tests*,
*test_for_regression*, and *ready_for_merge*. The first two indicate
that your pull request is not considered complete. With "needs_work"
the burden is exclusively on you, while "work_in_progress" can also
mean that a LAMMPS developer may want to add changes. Please watch the
comments on the pull request. The two "test" labels are used to
trigger extended tests before the code is merged. This is sometimes
done by LAMMPS developers if they suspect that there may be subtle side
effects from your changes. It is not done by default, because those
tests are very time consuming. The *ready_for_merge* label is usually
attached when the LAMMPS developer assigned to the pull request
considers the request complete and wants to trigger a final full test
evaluation.
**Reviews**
As of Summer 2018, a pull request needs at least 1 approving review
from a LAMMPS developer with write access to the repository.
In case your changes touch code that certain developers are associated
with, they are auto-requested by the GitHub software. Those associations
are set in the file
`.github/CODEOWNERS <https://github.com/lammps/lammps/blob/master/.github/CODEOWNERS>`_
Thus if you want to be automatically notified to review when anybody
changes files or packages, that you have contributed to LAMMPS, you can
add suitable patterns to that file, or a LAMMPS developer may add you.
As of Fall 2021, a pull request needs to pass all automatic tests and
receive at least 1 approving review from a LAMMPS developer with write
access to the repository before it is eligible for merging. In case
your changes touch code that certain developers are associated with,
those developers are auto-requested as reviewers by the GitHub
software. The associations are set in the file `.github/CODEOWNERS
<https://github.com/lammps/lammps/blob/develop/.github/CODEOWNERS>`_.
Thus, if you want to be automatically notified to review when anybody
changes files or packages that **you** have contributed to LAMMPS, you
can add suitable patterns to that file, or a LAMMPS developer may add
you.
Otherwise, you can also manually request reviews from specific developers,
or LAMMPS developers - in their assessment of your pull request - may
@ -329,7 +352,7 @@ LAMMPS developer (including him/herself) or c) Axel Kohlmeyer (akohlmey).
After the review, the developer can choose to implement changes directly
or suggest them to you.
* Case c) means that the pull request has been assigned to the developer
overseeing the merging of pull requests into the master branch.
overseeing the merging of pull requests into the *develop* branch.
In this case, Axel assigned the tutorial to Steve:
@ -351,11 +374,11 @@ Sometimes, however, you might not feel comfortable having other people
push changes into your own branch, or maybe the maintainers are not sure
their idea was the right one. In such a case, they can make changes,
reassign you as the assignee, and file a "reverse pull request", i.e.
file a pull request in your GitHub repository to include changes in the
branch, that you have submitted as a pull request yourself. In that
case, you can choose to merge their changes back into your branch,
possibly make additional changes or corrections and proceed from there.
It looks something like this:
file a pull request in **your** forked GitHub repository to include
changes in the branch that you have submitted as a pull request
yourself. In that case, you can choose to merge their changes back
into your branch, possibly make additional changes or corrections, and
proceed from there. It looks something like this:
.. image:: JPG/tutorial_reverse_pull_request.png
:align: center
@ -419,7 +442,7 @@ This merge also shows up on the lammps GitHub page:
**After a merge**
When everything is fine, the feature branch is merged into the master branch:
When everything is fine, the feature branch is merged into the *develop* branch:
.. image:: JPG/tutorial_merged.png
:align: center
@ -433,8 +456,8 @@ branch!
.. code-block:: bash
$ git checkout master
$ git pull master
$ git checkout develop
$ git pull https://github.com/lammps/lammps develop
$ git branch -d github-tutorial-update
If you do not pull first, it is not really a problem but git will warn
@ -442,6 +465,7 @@ you at the next statement that you are deleting a local branch that
was not yet fully merged into HEAD. This is because git does not yet
know your branch just got merged into LAMMPS upstream. If you
first delete and then pull, everything should still be fine.
You can display all branches that are fully merged by:
Finally, if you delete the branch locally, you might want to push this
to your remote(s) as well:
@ -453,14 +477,14 @@ to your remote(s) as well:
**Recent changes in the workflow**
Some changes to the workflow are not captured in this tutorial. For
example, in addition to the master branch, to which all new features
should be submitted, there is now also an "unstable" and a "stable"
branch; these have the same content as "master", but are only updated
after a patch release or stable release was made.
Furthermore, the naming of the patches now follow the pattern
"patch_<Day><Month><Year>" to simplify comparisons between releases.
Finally, all patches and submissions are subject to automatic testing
and code checks to make sure they at the very least compile.
example, in addition to the *develop* branch, to which all new features
should be submitted, there is also a *release* and a *stable* branch;
these have the same content as *develop*, but are only updated after a
patch release or stable release was made. Furthermore, the naming of
the patches now follows the pattern "patch_<Day><Month><Year>" to
simplify comparisons between releases. Finally, all patches and
submissions are subject to automatic testing and code checks to make
sure that they at the very least compile.
A discussion of the LAMMPS developer GitHub workflow can be found in the file
`doc/github-development-workflow.md <https://github.com/lammps/lammps/blob/master/doc/github-development-workflow.md>`_
`doc/github-development-workflow.md <https://github.com/lammps/lammps/blob/develop/doc/github-development-workflow.md>`_

View File

@ -9,7 +9,8 @@ has several advantages:
command.
* You can create your own development branches to add code to LAMMPS.
* You can submit your new features back to GitHub for inclusion in
LAMMPS.
LAMMPS. For that you should first create your own :doc:`fork on
GitHub <Howto_github>`.
You must have `git <git_>`_ installed on your system to use the
commands explained below to communicate with the git servers on
@ -20,35 +21,56 @@ provides `limited support for subversion clients <svn_>`_.
As of October 2016, the official home of public LAMMPS development is
on GitHub. The previously advertised LAMMPS git repositories on
git.lammps.org and bitbucket.org are now deprecated or offline.
git.lammps.org and bitbucket.org are now offline or deprecated.
.. _git: https://git-scm.com
.. _svn: https://help.github.com/en/github/importing-your-projects-to-github/working-with-subversion-on-github
You can follow LAMMPS development on 3 different git branches:
You can follow the LAMMPS development on 3 different git branches:
* **stable** : this branch is updated with every stable release
* **unstable** : this branch is updated with every patch release
* **master** : this branch continuously follows ongoing development
* **stable** : this branch is updated from the *release* branch with
every stable release version and also has selected bug fixes and updates
back-ported from the *develop* branch
* **release** : this branch is updated with every patch release;
updates are always "fast forward" merges from *develop*
* **develop** : this branch follows the ongoing development and
is updated with every merge commit of a pull request
To access the git repositories on your box, use the clone command to
create a local copy of the LAMMPS repository with a command like:
.. code-block:: bash
$ git clone -b unstable https://github.com/lammps/lammps.git mylammps
$ git clone -b release https://github.com/lammps/lammps.git mylammps
where "mylammps" is the name of the directory you wish to create on
your machine and "unstable" is one of the 3 branches listed above.
your machine and "release" is one of the 3 branches listed above.
(Note that you actually download all 3 branches; you can switch
between them at any time using "git checkout <branch name>".)
.. admonition:: Saving time and disk space when using ``git clone``
The complete git history of the LAMMPS project is quite large because
it contains the entire commit history of the project since fall 2006,
which includes the time when LAMMPS was managed with subversion.
This includes a few commits that have added and removed some large
files (mostly by accident). If you do not need access to the entire
commit history (most people don't), you can speed up the "cloning"
process and reduce local disk space requirements by using the
*--depth* git command line flag. That will create a "shallow clone"
of the repository containing only a subset of the git history. Using
a depth of 1000 is usually sufficient to include the head commits of
the *develop* and the *release* branches. To include the head commit
of the *stable* branch you may need a depth of up to 10000. If you
later need more of the git history, you can always convert the
shallow clone into a "full clone".
Once the command completes, your directory will contain the same files
as if you unpacked a current LAMMPS tarball, with the exception, that
the HTML documentation files are not included. They can be fetched
from the LAMMPS website by typing ``make fetch`` in the doc directory.
Or they can be generated from the content provided in doc/src by
typing ``make html`` from the doc directory.
Or they can be generated from the content provided in ``doc/src`` by
typing ``make html`` from the ``doc`` directory.
After initial cloning, as bug fixes and new features are added to
LAMMPS you can stay up-to-date by typing the following git commands
@ -56,9 +78,9 @@ from within the "mylammps" directory:
.. code-block:: bash
$ git checkout unstable # not needed if you always stay in this branch
$ git checkout stable # use one of these 3 checkout commands
$ git checkout master # to choose the branch to follow
$ git checkout release # not needed if you always stay in this branch
$ git checkout stable # use one of these 3 checkout commands
$ git checkout develop # to choose the branch to follow
$ git pull
Doing a "pull" will not change any files you have added to the LAMMPS
@ -81,7 +103,7 @@ Stable versions and what tagID to use for a particular stable version
are discussed on `this page <https://www.lammps.org/bug.html#version>`_.
Note that this command will print some warnings, because in order to get
back to the latest revision and to be able to update with ``git pull``
again, you will need to do ``git checkout unstable`` (or
again, you will need to do ``git checkout release`` (or
check out any other desired branch) first.
Once you have updated your local files with a ``git pull`` (or ``git
@ -137,9 +159,9 @@ changed. How to do this depends on the build system you are using.
.. admonition:: Git protocols
:class: note
The servers at github.com support the "git://" and "https://" access
protocols for anonymous, read-only access. If you have a suitably
configured GitHub account, you may also use SSH protocol with the
The servers at github.com support the "https://" access protocol for
anonymous, read-only access. If you have a suitably configured GitHub
account, you may also use SSH protocol with the
URL "git@github.com:lammps/lammps.git".
The LAMMPS GitHub project is currently managed by Axel Kohlmeyer

View File

@ -4,28 +4,41 @@ Citing LAMMPS
Core Algorithms
^^^^^^^^^^^^^^^
Since LAMMPS is a community project, there is not a single one
publication or reference that describes **all** of LAMMPS.
The canonical publication that describes the foundation, that is
the basic spatial decomposition approach, the neighbor finding,
and basic communications algorithms used in LAMMPS is:
The paper mentioned below is the best overview of LAMMPS, but there are
also publications describing particular models or algorithms
implemented in LAMMPS or complementary software that it has interfaces
to. Please see below for how to cite contributions to LAMMPS.
`S. Plimpton, Fast Parallel Algorithms for Short-Range Molecular Dynamics, J Comp Phys, 117, 1-19 (1995). <http://www.sandia.gov/~sjplimp/papers/jcompphys95.pdf>`_
.. _lammps_paper:
So any project using LAMMPS (or a derivative application using LAMMPS as
a simulation engine) should cite this paper. A new publication
describing the developments and improvements of LAMMPS in the 25 years
since then is currently in preparation.
The latest canonical publication that describes the basic features, the
source code design, the program structure, the spatial decomposition
approach, the neighbor finding, basic communications algorithms, and how
users and developers have contributed to LAMMPS is:
`LAMMPS - A flexible simulation tool for particle-based materials modeling at the atomic, meso, and continuum scales, Comp. Phys. Comm. 271, 108171 (2022) <https://doi.org/10.1016/j.cpc.2021.108171>`_
So a project using LAMMPS or a derivative application that uses LAMMPS
as a simulation engine should cite this paper. The paper is expected to
be published in its final form under the same DOI in the first half
of 2022. Please also give the URL of the LAMMPS website in your paper,
namely https://www.lammps.org.
The original publication describing the parallel algorithms used in the
initial versions of LAMMPS is:
`S. Plimpton, Fast Parallel Algorithms for Short-Range Molecular Dynamics, J Comp Phys, 117, 1-19 (1995). <http://www.sandia.gov/~sjplimp/papers/jcompphys95.pdf>`_
DOI for the LAMMPS code
^^^^^^^^^^^^^^^^^^^^^^^
LAMMPS developers use the `Zenodo service at CERN
<https://zenodo.org/>`_ to create digital object identifies (DOI) for
stable releases of the LAMMPS code. There are two types of DOIs for the
LAMMPS source code: the canonical DOI for **all** versions of LAMMPS,
which will always point to the **latest** stable release version is:
LAMMPS developers use the `Zenodo service at CERN <https://zenodo.org/>`_
to create digital object identifiers (DOIs) for stable releases of the
LAMMPS source code. There are two types of DOIs for the LAMMPS source code.
The canonical DOI for **all** versions of LAMMPS, which will always
point to the **latest** stable release version, is:
- DOI: `10.5281/zenodo.3726416 <https://dx.doi.org/10.5281/zenodo.3726416>`_
@ -45,11 +58,13 @@ about LAMMPS and its features.
Citing contributions
^^^^^^^^^^^^^^^^^^^^
LAMMPS has many features and that use either previously published
methods and algorithms or novel features. It also includes potential
parameter filed for specific models. Where available, a reminder about
references for optional features used in a specific run is printed to
the screen and log file. Style and output location can be selected with
the :ref:`-cite command-line switch <cite>`. Additional references are
LAMMPS has many features that implement either previously published
methods and algorithms or novel ones. It also includes potential parameter
files for specific models. Where available, a reminder about references
for optional features used in a specific run is printed to the screen
and log file. Style and output location can be selected with the
:ref:`-cite command-line switch <cite>`. Additional references are
given in the documentation of the :doc:`corresponding commands
<Commands_all>` or in the :doc:`Howto tutorials <Howto>`.
<Commands_all>` or in the :doc:`Howto tutorials <Howto>`. So please
make certain that you provide the proper acknowledgments and citations
in any published works using LAMMPS.

View File

@ -19,7 +19,7 @@ software and open-source distribution, see `www.gnu.org <gnuorg_>`_
or `www.opensource.org <opensource_>`_. The legal text of the GPL as it
applies to LAMMPS is in the LICENSE file included in the LAMMPS distribution.
.. _gpl: https://github.com/lammps/lammps/blob/master/LICENSE
.. _gpl: https://github.com/lammps/lammps/blob/develop/LICENSE
.. _lgpl: https://www.gnu.org/licenses/old-licenses/lgpl-2.1.html

View File

@ -34,7 +34,7 @@ simple example demonstrating its use:
int lmpargc = sizeof(lmpargv)/sizeof(const char *);
/* create LAMMPS instance */
handle = lammps_open_no_mpi(lmpargc, lmpargv, NULL);
handle = lammps_open_no_mpi(lmpargc, (char **)lmpargv, NULL);
if (handle == NULL) {
printf("LAMMPS initialization failed");
lammps_mpi_finalize();

View File

@ -7,26 +7,34 @@ correctly and reliably at all times. You can follow its development
in a public `git repository on GitHub <https://github.com/lammps/lammps>`_.
Whenever we fix a bug or update or add a feature, it will be merged into
the `master` branch of the git repository. When a sufficient number of
the *develop* branch of the git repository. When a sufficient number of
changes have accumulated *and* the software passes a set of automated
tests, we release it in the next *patch* release, which are made every
few weeks. Info on patch releases are on `this website page
few weeks. The *release* branch of the git repository is updated with
every such release. Info on patch releases is on `this website page
<https://www.lammps.org/bug.html>`_.
Once or twice a year, only bug fixes and small, non-intrusive changes are
included for a period of time, and the code is subjected to more detailed
Once or twice a year, we apply only bug fixes and small, non-intrusive
changes to the *develop* branch and the code is subjected to more detailed
and thorough testing than the default automated testing. The latest
patch release after such a period is then labeled as a *stable* version.
patch release after such a period is then also labeled as a *stable* version
and the *stable* branch is updated with it. Between stable releases,
we occasionally publish updates to the stable release that contain only
bug fixes and updates back-ported from *develop*, but no new features;
the *stable* branch is updated accordingly.
Each version of LAMMPS contains all the features and bug-fixes up to
and including its version date.
Each version of LAMMPS contains all the documented features up to and
including its version date.
The version date is printed to the screen and logfile every time you
run LAMMPS. It is also in the file src/version.h and in the LAMMPS
directory name created when you unpack a tarball. And it is on the
first page of the :doc:`manual <Manual>`.
* If you browse the HTML pages on the LAMMPS WWW site, they always
describe the most current patch release of LAMMPS.
* If you browse the HTML pages on the LAMMPS WWW site, they will by
default describe the most current patch release version of LAMMPS.
In the navigation bar on the bottom left, there is the option to
view instead the documentation for the most recent *stable* version
or the latest version from the current development branch.
* If you browse the HTML pages included in your tarball, they
describe the version you have, which may be older.

View File

@ -12,24 +12,24 @@ includes some optional methods to enable its use with rRESPA.
Here is a brief description of the class methods in pair.h:
+---------------------------------+-------------------------------------------------------------------+
| compute | workhorse routine that computes pairwise interactions |
+---------------------------------+-------------------------------------------------------------------+
| settings | reads the input script line with arguments you define |
+---------------------------------+-------------------------------------------------------------------+
| coeff | set coefficients for one i,j type pair |
+---------------------------------+-------------------------------------------------------------------+
| init_one | perform initialization for one i,j type pair |
+---------------------------------+-------------------------------------------------------------------+
| init_style | initialization specific to this pair style |
+---------------------------------+-------------------------------------------------------------------+
| write & read_restart | write/read i,j pair coeffs to restart files |
+---------------------------------+-------------------------------------------------------------------+
| write & read_restart_settings | write/read global settings to restart files |
+---------------------------------+-------------------------------------------------------------------+
| single | force and energy of a single pairwise interaction between 2 atoms |
+---------------------------------+-------------------------------------------------------------------+
| compute_inner/middle/outer | versions of compute used by rRESPA |
+---------------------------------+-------------------------------------------------------------------+
+---------------------------------+---------------------------------------------------------------------+
| compute | workhorse routine that computes pairwise interactions |
+---------------------------------+---------------------------------------------------------------------+
| settings | reads the input script line with arguments you define |
+---------------------------------+---------------------------------------------------------------------+
| coeff | set coefficients for one i,j type pair |
+---------------------------------+---------------------------------------------------------------------+
| init_one | perform initialization for one i,j type pair |
+---------------------------------+---------------------------------------------------------------------+
| init_style | initialization specific to this pair style |
+---------------------------------+---------------------------------------------------------------------+
| write & read_restart | write/read i,j pair coeffs to restart files |
+---------------------------------+---------------------------------------------------------------------+
| write & read_restart_settings | write/read global settings to restart files |
+---------------------------------+---------------------------------------------------------------------+
| single | force/r and energy of a single pairwise interaction between 2 atoms |
+---------------------------------+---------------------------------------------------------------------+
| compute_inner/middle/outer | versions of compute used by rRESPA |
+---------------------------------+---------------------------------------------------------------------+
The inner/middle/outer routines are optional.
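As a rough outline of how these methods appear in practice, here is a
sketch of a derived pair style class declaration. It only compiles
inside the LAMMPS source tree, and the signatures are illustrative; the
authoritative declarations are those in ``pair.h`` of the LAMMPS
version you are developing against.

.. code-block:: c++

   #include "pair.h"      // base class from the LAMMPS source tree

   namespace LAMMPS_NS {

   class PairExample : public Pair {
    public:
     PairExample(class LAMMPS *lmp) : Pair(lmp) {}

     void compute(int eflag, int vflag) override;      // workhorse: pairwise interactions
     void settings(int narg, char **arg) override;     // parse pair_style arguments
     void coeff(int narg, char **arg) override;        // coefficients for one i,j type pair
     void init_style() override;                       // style-specific initialization
     double init_one(int i, int j) override;           // per i,j pair setup, returns cutoff
     void write_restart(FILE *fp) override;            // i,j pair coeffs to restart file
     void read_restart(FILE *fp) override;
     void write_restart_settings(FILE *fp) override;   // global settings to restart file
     void read_restart_settings(FILE *fp) override;
     double single(int i, int j, int itype, int jtype, // force/r and energy of one pair
                   double rsq, double factor_coul, double factor_lj,
                   double &fforce) override;

     // optional methods used by rRESPA
     void compute_inner() override;
     void compute_middle() override;
     void compute_outer(int eflag, int vflag) override;
   };

   }    // namespace LAMMPS_NS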

View File

@ -2,17 +2,25 @@ Basics of running LAMMPS
========================
LAMMPS is run from the command line, reading commands from a file via
the -in command line flag, or from standard input.
Using the "-in in.file" variant is recommended:
the -in command line flag, or from standard input. Using the "-in
in.file" variant is recommended (see the note below). The name of the
LAMMPS executable is either ``lmp`` or ``lmp_<machine>``, with
`<machine>` being the machine string used when compiling LAMMPS. The
machine suffix is required when compiling LAMMPS with the traditional
build system (e.g. with ``make mpi``), but optional when using CMake to
configure and build LAMMPS:
.. code-block:: bash
$ lmp_serial -in in.file
$ lmp_serial < in.file
$ lmp -in in.file
$ lmp < in.file
$ /path/to/lammps/src/lmp_serial -i in.file
$ mpirun -np 4 lmp_mpi -in in.file
$ mpiexec -np 4 lmp -in in.file
$ mpirun -np 8 /path/to/lammps/src/lmp_mpi -in in.file
$ mpirun -np 6 /usr/local/bin/lmp -in in.file
$ mpiexec -n 6 /usr/local/bin/lmp -in in.file
You normally run the LAMMPS command in the directory where your input
script is located. That is also where output files are produced by
@ -23,7 +31,7 @@ executable itself can be placed elsewhere.
.. note::
The redirection operator "<" will not always work when running
in parallel with mpirun; for those systems the -in form is required.
in parallel with mpirun or mpiexec; for those systems the -in form is required.
As LAMMPS runs it prints info to the screen and a logfile named
*log.lammps*\ . More info about output is given on the

View File

@ -278,16 +278,20 @@ eam database tool
-----------------------------
The tools/eam_database directory contains a Fortran program that will
generate EAM alloy setfl potential files for any combination of 16
generate EAM alloy setfl potential files for any combination of 17
elements: Cu, Ag, Au, Ni, Pd, Pt, Al, Pb, Fe, Mo, Ta, W, Mg, Co, Ti,
Zr. The files can then be used with the :doc:`pair_style eam/alloy <pair_eam>` command.
Zr, Cr. The files can then be used with the :doc:`pair_style eam/alloy <pair_eam>` command.
The tool is authored by Xiaowang Zhou (Sandia), xzhou at sandia.gov,
and is based on his paper:
with updates from Lucas Hale (NIST) lucas.hale at nist.gov and is based on his paper:
X. W. Zhou, R. A. Johnson, and H. N. G. Wadley, Phys. Rev. B, 69,
144113 (2004).
The parameters for Cr were taken from:
Lin Z B, Johnson R A and Zhigilei L V, Phys. Rev. B 77 214108 (2008).
----------
.. _eamgn:

View File

@ -1011,7 +1011,9 @@ can be disabled with the :code:`checksum` parameter.
Restrictions
""""""""""""
none
*dump_modify sort* is not supported for dumps of groups containing
more than 2 billion atoms.
Related commands
""""""""""""""""

View File

@ -38,7 +38,7 @@ Syntax
*intersect* args = two or more group IDs
*dynamic* args = parent-ID keyword value ...
one or more keyword/value pairs may be appended
keyword = *region* or *var* or *every*
keyword = *region* or *var* or *property* or *every*
*region* value = region-ID
*var* value = name of variable
*property* value = name of custom integer or floating point vector

(new binary image files, not shown, including doc/src/img/decomp-rcb.png
and doc/src/img/ghost-comm.png)
View File

@ -310,7 +310,7 @@ Forschungszentrum Juelich.
The library is available for download at "http://scafacos.de" or can
be cloned from the git-repository
"git://github.com/scafacos/scafacos.git".
"https://github.com/scafacos/scafacos.git".
In order to use this KSpace style, you must download and build the
ScaFaCoS library, then build LAMMPS with the SCAFACOS package

View File

@ -205,7 +205,7 @@ For *damping mass_velocity*, the normal damping is given by:
\eta_n = \eta_{n0} m_{eff}
Here, :math:`\eta_{n0}` is the damping coefficient specified for the normal
contact model, in units of *mass*\ /\ *time* and
contact model, in units of 1/\ *time* and
:math:`m_{eff} = m_i m_j/(m_i + m_j)` is the effective mass.
Use *damping mass_velocity* to reproduce the damping behavior of
*pair gran/hooke/\**.

View File

@ -26,15 +26,29 @@ Examples
Description
"""""""""""
The *lebedeva/z* style computes the Lebedeva interaction
potential as described in :ref:`(Lebedeva et al.) <Leb01>`. An important simplification is made,
which is to take all normals along the z-axis.
The *lebedeva/z* pair style computes the Lebedeva interaction potential
as described in :ref:`(Lebedeva1) <Leb01>` and :ref:`(Lebedeva2)
<Leb02>`. An important simplification is made, which is to take all
normals along the z-axis.
The Lebedeva potential is intended for the description of the interlayer
interaction between graphene layers. To perform a realistic simulation,
this potential must be used in combination with an intralayer potential
such as :doc:`AIREBO <pair_airebo>` or :doc:`Tersoff <pair_tersoff>`
facilitated by using pair style :doc:`hybrid/overlay <pair_hybrid>`. To
keep the intralayer properties unaffected, the interlayer interaction
within the same layers should be avoided. This can be achieved by
assigning different atom types to atoms of different layers (e.g. 1 and
2 in the examples above).
Other interactions can be set to zero using pair_style *none*\ .
.. math::
E = & \frac{1}{2} \sum_i \sum_{i \neq j} V_{ij}\\
E = & \frac{1}{2} \sum_i \sum_{j \neq i} V_{ij}\\
V_{ij} = & B e^{-\alpha(r_{ij} - z_0)} \\
& + C(1 + D_1\rho^2_{ij} + D_2\rho^4_{ij} e^{-\lambda_1\rho^2_{ij}} e^{-\lambda_2 (z^2_{ij} - z^2_0)} \\
& + C(1 + D_1\rho^2_{ij} + D_2\rho^4_{ij}) e^{-\lambda_1\rho^2_{ij}} e^{-\lambda_2 (z^2_{ij} - z^2_0)} \\
& - A \left(\frac{z_0}{r_{ij}}\right)^6 + A \left( \frac{z_0}{r_c} \right)^6 \\
\rho^2_{ij} = & x^2_{ij} + y^2_{ij} \qquad (\mathbf{n_i} \equiv \mathbf{\hat{z}})
@ -43,12 +57,15 @@ Energies are shifted so that they go continuously to zero at the cutoff assuming
that the exponential part of :math:`V_{ij}` (first term) decays sufficiently fast.
This shift is achieved by the last term in the equation for :math:`V_{ij}` above.
The parameter file (e.g. CC.Lebedeva), is intended for use with metal
:doc:`units <units>`, with energies in meV. An additional parameter, *S*,
is available to facilitate scaling of energies.
The provided parameter file (CC.Lebedeva) contains two sets of parameters.
This potential must be used in combination with hybrid/overlay.
Other interactions can be set to zero using pair_style *none*\ .
- The first set (element name "C") is suitable for normal conditions and
is taken from :ref:`(Popov1) <Popov>`
- The second set (element name "C1") is suitable for high-pressure
conditions and is taken from :ref:`(Koziol1) <Koziol>`
Both sets contain an additional parameter, *S*, that can be used to
facilitate scaling of energies and is set to 1.0 by default.
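As a concrete sketch of this setup for a two-type graphene bilayer (the intralayer potential choice, its file name, and the interlayer cutoff below are placeholders rather than values prescribed by this page; the Examples section above gives the authoritative syntax):

.. code-block:: LAMMPS

   # atom type 1 = lower layer, atom type 2 = upper layer
   pair_style   hybrid/overlay rebo lebedeva/z 14.0
   pair_coeff   * * rebo       CH.airebo   C C    # intralayer bonding in both layers
   pair_coeff   1 2 lebedeva/z CC.Lebedeva C C    # interlayer interaction only

Using the element name "C1" instead of "C" in the *lebedeva/z* pair_coeff line selects the high-pressure parameter set described above.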
Restrictions
""""""""""""
@ -77,4 +94,16 @@ none
.. _Leb01:
**(Lebedeva et al.)** I. V. Lebedeva, A. A. Knizhnik, A. M. Popov, Y. E. Lozovik, B. V. Potapkin, Phys. Rev. B, 84, 245437 (2011)
**(Lebedeva1)** I. V. Lebedeva, A. A. Knizhnik, A. M. Popov, Y. E. Lozovik, B. V. Potapkin, Phys. Rev. B, 84, 245437 (2011)
.. _Leb02:
**(Lebedeva2)** I. V. Lebedeva, A. A. Knizhnik, A. M. Popov, Y. E. Lozovik, B. V. Potapkin, Physica E: 44, 949-954 (2012)
.. _Popov:
**(Popov1)** A.M. Popov, I. V. Lebedeva, A. A. Knizhnik, Y. E. Lozovik and B. V. Potapkin, Chem. Phys. Lett. 536, 82-86 (2012).
.. _Koziol:
**(Koziol1)** Z. Koziol, G. Gawlik and J. Jagielski, Chinese Phys. B 28, 096101 (2019).

View File

@ -26,23 +26,25 @@ Examples
Description
"""""""""""
The local density (LD) potential is a mean-field manybody potential, and, in some
sense,a generalization of embedded atom models (EAM). The name "local density
potential" arises from the fact that it assigns an energy to an atom depending
on the number of neighboring atoms of given type around it within a predefined
spherical volume (i.e., within a cutoff). The bottom-up coarse-graining (CG)
literature suggests that such potentials can be widely useful in capturing
effective multibody forces in a computationally efficient manner so as to
improve the quality of CG models of implicit solvation:ref:`(Sanyal1) <Sanyal1>` and
phase-segregation in liquid mixtures:ref:`(Sanyal2) <Sanyal2>`, and provide guidelines
to determine the extent of manybody correlations present in a CG
model.:ref:`(Rosenberger) <Rosenberger>` The LD potential in LAMMPS is primarily
intended to be used as a corrective potential over traditional pair potentials
in bottom-up CG models, i.e., as a hybrid pair style with
other explicit pair interaction terms (e.g., table spline, Lennard Jones, etc.).
Because the LD potential is not a pair potential per se, it is implemented
simply as a single auxiliary file with all specifications that will be read
upon initialization.
The local density (LD) potential is a mean-field manybody potential,
and, in some way, a generalization of embedded atom models (EAM). The
name "local density potential" arises from the fact that it assigns an
energy to an atom depending on the number of neighboring atoms of a
given type around it within a predefined spherical volume (i.e., within
the cutoff). The bottom-up coarse-graining (CG) literature suggests
that such potentials can be widely useful in capturing effective
multibody forces in a computationally efficient manner and thus improve
the quality of CG models of implicit solvation :ref:`(Sanyal1)
<Sanyal1>` and phase-segregation in liquid mixtures :ref:`(Sanyal2)
<Sanyal2>`, and provide guidelines to determine the extent of manybody
correlations present in a CG model :ref:`(Rosenberger) <Rosenberger>`.
The LD potential in LAMMPS is primarily intended to be used as a
corrective potential over traditional pair potentials in bottom-up CG
models via :doc:`hybrid/overlay pair style <pair_hybrid>` with other
explicit pair interaction terms (e.g., tabulated, Lennard-Jones, Morse
etc.). Because the LD potential is not a pair potential per se, it is
implemented simply as a single auxiliary file with all specifications
that will be read upon initialization.
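The benzene-water example whose log files appear later in this changeset uses exactly this pattern; trimmed to the pair-style related commands (file and section names as in that example), it reads:

.. code-block:: LAMMPS

   # tabulated pair interactions plus a local-density correction on top
   pair_style  hybrid/overlay table spline 500 local/density
   pair_coeff  1 1 table benzene_water.pair.table PairBB
   pair_coeff  1 2 table benzene_water.pair.table PairWW
   pair_coeff  2 2 table benzene_water.pair.table PairBW
   pair_coeff  * * local/density benzene_water.localdensity.table

Because the single local/density file carries all LD specifications, one wildcard pair_coeff line is sufficient for that sub-style.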
.. note::

View File

@ -126,11 +126,11 @@ and *compute_energy*, which both take 3 numerical arguments:
* itype = the (numerical) type of the first atom
* jtype = the (numerical) type of the second atom
These functions need to compute the force and the energy, respectively,
and use the result as the return value. The functions need to use the
*pmap* dictionary to convert the LAMMPS atom type number to the symbolic
value of the internal potential parameter data structure. Following
the *LJCutMelt* example, here are the two functions:
These functions need to compute the (scaled) force and the energy,
respectively, and use the result as the return value. The functions need
to use the *pmap* dictionary to convert the LAMMPS atom type number
to the symbolic value of the internal potential parameter data structure.
Following the *LJCutMelt* example, here are the two functions:
.. code-block:: python
@ -154,10 +154,10 @@ the *LJCutMelt* example, here are the two functions:
for consistency with the C++ pair styles in LAMMPS, the
*compute_force* function follows the conventions of the Pair::single()
methods and does not return the full force, but the force scaled by
the distance between the two atoms, so this value only needs to be
multiplied by delta x, delta y, and delta z to conveniently obtain the
three components of the force vector between these two atoms.
methods and does not return the pairwise force directly, but the force
divided by the distance between the two atoms, so this value only needs
to be multiplied by delta x, delta y, and delta z to conveniently obtain
the three components of the force vector between these two atoms.
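For the 12-6 Lennard-Jones potential used in the *LJCutMelt* example, this convention means the function returns :math:`F(r)/r` rather than :math:`F(r)` itself (a worked form of the standard LJ expressions, independent of the exact variable names used in the example):

.. math::

   F(r) = -\frac{d E_{LJ}}{d r} = 24\varepsilon\left(\frac{2\sigma^{12}}{r^{13}} - \frac{\sigma^{6}}{r^{7}}\right),
   \qquad
   \frac{F(r)}{r} = 24\varepsilon\left(\frac{2\sigma^{12}}{r^{14}} - \frac{\sigma^{6}}{r^{8}}\right)

Multiplying the returned value by :math:`\Delta x`, :math:`\Delta y`, and :math:`\Delta z` then yields the three Cartesian components of the pairwise force.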
----------

View File

@ -1,7 +1,7 @@
Sphinx==4.0.3
sphinxcontrib-spelling
git+git://github.com/akohlmey/sphinx-fortran@parallel-read
sphinx_tabs
breathe
Pygments
six
Sphinx==4.2.0
sphinxcontrib-spelling==7.2.1
git+https://github.com/akohlmey/sphinx-fortran@parallel-read
sphinx_tabs==3.2.0
breathe==4.31.0
Pygments==2.10.0
six==1.16.0

View File

@ -1122,6 +1122,7 @@ gaussian
gaussians
Gaussians
Gavhane
Gawlik
gayberne
gcc
gcmc
@ -1476,6 +1477,7 @@ Izz
Jacobsen
Jadhao
Jadhav
Jagielski
jagreat
Jahn
Jalalvand
@ -1602,6 +1604,7 @@ Koslowski
Kosovan
Koster
Kosztin
Koziol
Kp
kradius
Kraker

View File

@ -8,7 +8,7 @@ bond_style harmonic
bond_coeff 1 100 1.122462 # K R0
velocity all create 1.0 8008 loop geom
pair_style lj/cut/coul/long 1.122462 20
pair_style lj/cut/coul/long/soft 2 0.5 10.0 1.122462 20
pair_coeff * * 1.0 1.0 1.122462 # charges
kspace_style pppm 1.0e-3
pair_modify shift yes

View File

@ -0,0 +1 @@
../../../potentials/BN.extep

View File

@ -15,7 +15,7 @@ neigh_modify check yes
# Potential
pair_style extep
pair_coeff * * ../../../../potentials/BN.extep B N
pair_coeff * * BN.extep B N
# Output
thermo 10

View File

@ -1,267 +0,0 @@
LAMMPS (7 Aug 2019)
# LAMMPS input file for 26.5% benzene mole fraction solution
# with 380 benzene and 1000 water molecules,
# using all possible local density potentials
# between benzene and water
#
# Author: Tanmoy Sanyal, Shell Group, UC Santa Barbara
#
# Refer: Sanyal and Shell, JPC-B, 2018, 122 (21), 5678-5693
# Initialize simulation box
dimension 3
boundary p p p
units real
atom_style molecular
# Set potential styles
pair_style hybrid/overlay table spline 500 local/density
# Read molecule data and set initial velocities
read_data benzene_water.data
orthogonal box = (-12.865 -12.865 -64.829) to (12.865 12.865 64.829)
1 by 1 by 8 MPI processor grid
reading atoms ...
1380 atoms
0 = max # of 1-2 neighbors
0 = max # of 1-3 neighbors
0 = max # of 1-4 neighbors
1 = max # of special neighbors
special bonds CPU = 0.000566959 secs
read_data CPU = 0.00661397 secs
velocity all create 3.0000e+02 16611 rot yes dist gaussian
# Assign potentials
pair_coeff 1 1 table benzene_water.pair.table PairBB
WARNING: 33 of 500 force values in table are inconsistent with -dE/dr.
Should only be flagged at inflection points (../pair_table.cpp:483)
WARNING: 150 of 500 distance values in table with relative error
over 1e-06 to re-computed values (../pair_table.cpp:492)
pair_coeff 1 2 table benzene_water.pair.table PairWW
WARNING: 61 of 500 force values in table are inconsistent with -dE/dr.
Should only be flagged at inflection points (../pair_table.cpp:483)
WARNING: 90 of 500 distance values in table with relative error
over 1e-06 to re-computed values (../pair_table.cpp:492)
pair_coeff 2 2 table benzene_water.pair.table PairBW
WARNING: 108 of 500 force values in table are inconsistent with -dE/dr.
Should only be flagged at inflection points (../pair_table.cpp:483)
WARNING: 135 of 500 distance values in table with relative error
over 1e-06 to re-computed values (../pair_table.cpp:492)
pair_coeff * * local/density benzene_water.localdensity.table
# Recentering during minimization and equilibration
fix recentering all recenter 0.0 0.0 0.0 units box
# Thermostat & time integration
timestep 2.0
thermo 100
thermo_style custom temp ke pe etotal ebond eangle edihed evdwl
# Minimization
minimize 1.e-4 0.0 10000 10000
WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (../min.cpp:168)
Neighbor list info ...
update every 1 steps, delay 0 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 15.25
ghost atom cutoff = 15.25
binsize = 7.625, bins = 4 4 18
2 neighbor lists, perpetual/occasional/extra = 2 0 0
(1) pair table, perpetual
attributes: half, newton on
pair build: half/bin/newton
stencil: half/bin/3d/newton
bin: standard
(2) pair local/density, perpetual, copy from (1)
attributes: half, newton on
pair build: copy
stencil: none
bin: none
Per MPI rank memory allocation (min/avg/max) = 8.061 | 8.32 | 8.674 Mbytes
Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl
300 1233.1611 4162.3053 5395.4665 0 0 0 4162.3053
300 1233.1611 2275.526 3508.6871 0 0 0 2275.526
Loop time of 0.352822 on 8 procs for 40 steps with 1380 atoms
71.3% CPU use with 8 MPI tasks x no OpenMP threads
Minimization stats:
Stopping criterion = linesearch alpha is zero
Energy initial, next-to-last, final =
4162.30533361 2208.86525108 2275.52597861
Force two-norm initial, final = 259.364 69.3915
Force max component initial, final = 22.2077 8.31436
Final line search alpha, max atom move = 2.90022e-12 2.41135e-11
Iterations, force evaluations = 40 110
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.053192 | 0.23903 | 0.32779 | 17.2 | 67.75
Bond | 9.0599e-06 | 1.6302e-05 | 2.5272e-05 | 0.0 | 0.00
Neigh | 0.00044513 | 0.0023614 | 0.0063851 | 5.1 | 0.67
Comm | 0.015469 | 0.090432 | 0.20295 | 20.0 | 25.63
Output | 0 | 0 | 0 | 0.0 | 0.00
Modify | 0 | 0 | 0 | 0.0 | 0.00
Other | | 0.02098 | | | 5.95
Nlocal: 172.5 ave 348 max 72 min
Histogram: 5 0 0 0 0 0 0 0 1 2
Nghost: 2193.62 ave 4352 max 932 min
Histogram: 3 0 0 2 0 0 2 0 0 1
Neighs: 9700.5 ave 20535 max 3685 min
Histogram: 5 0 0 0 0 0 0 1 0 2
Total # of neighbors = 77604
Ave neighs/atom = 56.2348
Ave special neighs/atom = 0
Neighbor list builds = 2
Dangerous builds = 0
# Set up integration parameters
fix timeintegration all nve
fix thermostat all langevin 3.0000e+02 3.0000e+02 1.0000e+02 81890
# Equilibration (for realistic results, run for 5000000 steps)
reset_timestep 0
run 5000
WARNING: Fix recenter should come after all other integration fixes (../fix_recenter.cpp:131)
Per MPI rank memory allocation (min/avg/max) = 6.936 | 7.195 | 7.552 Mbytes
Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl
300 1233.1611 2866.9109 4100.0721 0 0 0 2866.9109
273.33541 1123.5553 3983.2007 5106.756 0 0 0 3983.2007
293.68078 1207.1857 3319.6601 4526.8458 0 0 0 3319.6601
314.21462 1291.5908 3389.2178 4680.8086 0 0 0 3389.2178
323.77563 1330.8917 3332.9828 4663.8745 0 0 0 3332.9828
302.5902 1243.8082 3461.7692 4705.5774 0 0 0 3461.7692
295.39324 1214.2249 3411.5727 4625.7976 0 0 0 3411.5727
320.52341 1317.5234 3453.1931 4770.7164 0 0 0 3453.1931
312.00777 1282.5195 3403.3443 4685.8638 0 0 0 3403.3443
307.96774 1265.9128 3429.7809 4695.6937 0 0 0 3429.7809
294.75922 1211.6187 3388.8404 4600.4591 0 0 0 3388.8404
311.24567 1279.3869 3514.9603 4794.3472 0 0 0 3514.9603
306.6152 1260.3531 3447.2011 4707.5542 0 0 0 3447.2011
305.23306 1254.6718 3375.5092 4630.181 0 0 0 3375.5092
321.62889 1322.0675 3460.2581 4782.3256 0 0 0 3460.2581
316.37725 1300.4804 3437.0312 4737.5116 0 0 0 3437.0312
322.90522 1327.3139 3389.1262 4716.44 0 0 0 3389.1262
307.57893 1264.3146 3359.8491 4624.1637 0 0 0 3359.8491
302.22607 1242.3115 3406.1711 4648.4826 0 0 0 3406.1711
302.73997 1244.4239 3220.2582 4464.6821 0 0 0 3220.2582
303.66194 1248.2137 3318.4629 4566.6765 0 0 0 3318.4629
308.73862 1269.0815 3369.5894 4638.671 0 0 0 3369.5894
315.60294 1297.2976 3411.2405 4708.5381 0 0 0 3411.2405
310.0113 1274.3129 3360.1054 4634.4183 0 0 0 3360.1054
302.36229 1242.8714 3326.9845 4569.8559 0 0 0 3326.9845
317.78659 1306.2735 3355.4976 4661.7711 0 0 0 3355.4976
302.50479 1243.4571 3317.6846 4561.1417 0 0 0 3317.6846
304.29249 1250.8056 3423.5068 4674.3124 0 0 0 3423.5068
305.99948 1257.8222 3432.9395 4690.7617 0 0 0 3432.9395
309.93363 1273.9937 3393.657 4667.6506 0 0 0 3393.657
316.14884 1299.5415 3463.0636 4762.6051 0 0 0 3463.0636
300.38817 1234.7567 3309.2495 4544.0062 0 0 0 3309.2495
311.05735 1278.6128 3304.4418 4583.0546 0 0 0 3304.4418
311.11872 1278.865 3291.1891 4570.0542 0 0 0 3291.1891
315.74338 1297.8749 3341.3063 4639.1812 0 0 0 3341.3063
297.5658 1223.1552 3316.3862 4539.5414 0 0 0 3316.3862
311.79033 1281.6257 3357.4556 4639.0813 0 0 0 3357.4556
310.93666 1278.1167 3414.7694 4692.8861 0 0 0 3414.7694
307.37298 1263.468 3337.3889 4600.8569 0 0 0 3337.3889
298.84185 1228.4005 3329.6173 4558.0178 0 0 0 3329.6173
310.54684 1276.5143 3351.0852 4627.5995 0 0 0 3351.0852
300.0871 1233.5191 3302.2315 4535.7506 0 0 0 3302.2315
304.69078 1252.4427 3324.2508 4576.6935 0 0 0 3324.2508
313.50714 1288.6827 3330.4088 4619.0915 0 0 0 3330.4088
329.80018 1355.6559 3301.86 4657.5159 0 0 0 3301.86
304.57609 1251.9713 3365.2938 4617.2652 0 0 0 3365.2938
308.73584 1269.0701 3344.4155 4613.4856 0 0 0 3344.4155
306.90951 1261.5629 3304.4698 4566.0327 0 0 0 3304.4698
308.85761 1269.5707 3392.1511 4661.7218 0 0 0 3392.1511
302.78788 1244.6208 3317.0849 4561.7057 0 0 0 3317.0849
321.68092 1322.2813 3321.5755 4643.8568 0 0 0 3321.5755
Loop time of 16.3061 on 8 procs for 5000 steps with 1380 atoms
Performance: 52.986 ns/day, 0.453 hours/ns, 306.634 timesteps/s
69.6% CPU use with 8 MPI tasks x no OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 2.1872 | 10.542 | 14.607 | 116.7 | 64.65
Bond | 0.00044084 | 0.00069669 | 0.00095081 | 0.0 | 0.00
Neigh | 0.026948 | 0.15225 | 0.44344 | 42.0 | 0.93
Comm | 0.63452 | 4.2953 | 9.49 | 133.9 | 26.34
Output | 0.0016391 | 0.012378 | 0.050919 | 13.9 | 0.08
Modify | 0.45894 | 1.2107 | 4.4629 | 116.4 | 7.42
Other | | 0.09292 | | | 0.57
Nlocal: 172.5 ave 380 max 70 min
Histogram: 5 0 0 0 0 0 0 1 1 1
Nghost: 2213 ave 4440 max 903 min
Histogram: 3 0 0 2 0 0 2 0 0 1
Neighs: 10042.5 ave 24051 max 3500 min
Histogram: 5 0 0 0 0 0 0 1 1 1
Total # of neighbors = 80340
Ave neighs/atom = 58.2174
Ave special neighs/atom = 0
Neighbor list builds = 123
Dangerous builds = 1
# Turn off recentering during production phase
unfix recentering
# Setup trajectory output
dump myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element
dump_modify myDump element B W
dump_modify myDump sort id
# Production (for realistic results, run for 10000000 steps)
reset_timestep 0
run 1000
Per MPI rank memory allocation (min/avg/max) = 8.232 | 8.492 | 8.851 Mbytes
Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl
321.68092 1322.2813 3784.0834 5106.3647 0 0 0 3784.0834
310.59763 1276.7231 3318.3283 4595.0513 0 0 0 3318.3283
303.39445 1247.1141 3324.1191 4571.2332 0 0 0 3324.1191
311.37275 1279.9092 3305.0901 4584.9993 0 0 0 3305.0901
311.29071 1279.572 3248.216 4527.788 0 0 0 3248.216
314.53456 1292.906 3283.4563 4576.3623 0 0 0 3283.4563
316.52595 1301.0916 3258.9171 4560.0087 0 0 0 3258.9171
318.92447 1310.9509 3235.6256 4546.5765 0 0 0 3235.6256
311.79212 1281.6331 3308.099 4589.7321 0 0 0 3308.099
305.52477 1255.8709 3267.6907 4523.5616 0 0 0 3267.6907
301.07457 1237.5782 3206.3997 4443.9779 0 0 0 3206.3997
Loop time of 4.44139 on 8 procs for 1000 steps with 1380 atoms
Performance: 38.907 ns/day, 0.617 hours/ns, 225.155 timesteps/s
60.8% CPU use with 8 MPI tasks x no OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.656 | 2.5078 | 3.5775 | 57.7 | 56.46
Bond | 0.00013375 | 0.0001854 | 0.0002377 | 0.0 | 0.00
Neigh | 0.0048757 | 0.029188 | 0.090432 | 18.9 | 0.66
Comm | 0.51836 | 1.4427 | 2.6285 | 56.9 | 32.48
Output | 0.083084 | 0.089199 | 0.10333 | 2.3 | 2.01
Modify | 0.0087376 | 0.019705 | 0.038437 | 8.4 | 0.44
Other | | 0.3526 | | | 7.94
Nlocal: 172.5 ave 388 max 69 min
Histogram: 5 0 0 0 0 0 0 2 0 1
Nghost: 2207.88 ave 4429 max 896 min
Histogram: 3 0 0 2 0 0 2 0 0 1
Neighs: 10094.1 ave 24847 max 3403 min
Histogram: 5 0 0 0 0 0 1 1 0 1
Total # of neighbors = 80753
Ave neighs/atom = 58.5167
Ave special neighs/atom = 0
Neighbor list builds = 23
Dangerous builds = 0
Total wall time: 0:00:21

View File

@ -0,0 +1,300 @@
LAMMPS (27 Oct 2021)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
using 1 OpenMP thread(s) per MPI task
# LAMMPS input file for 26.5% benzene mole fraction solution
# with 380 benzene and 1000 water molecules,
# using all possible local density potentials
# between benzene and water
#
# Author: Tanmoy Sanyal, Shell Group, UC Santa Barbara
#
# Refer: Sanyal and Shell, JPC-B, 2018, 122 (21), 5678-5693
# Initialize simulation box
dimension 3
boundary p p p
units real
atom_style molecular
# Set potential styles
pair_style hybrid/overlay table spline 500 local/density
# Read molecule data and set initial velocities
read_data benzene_water.data
Reading data file ...
orthogonal box = (-12.865000 -12.865000 -64.829000) to (12.865000 12.865000 64.829000)
1 by 1 by 1 MPI processor grid
reading atoms ...
1380 atoms
Finding 1-2 1-3 1-4 neighbors ...
special bond factors lj: 0 0 0
special bond factors coul: 0 0 0
0 = max # of 1-2 neighbors
0 = max # of 1-3 neighbors
0 = max # of 1-4 neighbors
1 = max # of special neighbors
special bonds CPU = 0.000 seconds
read_data CPU = 0.006 seconds
velocity all create 3.0000e+02 16611 rot yes dist gaussian
# Assign potentials
pair_coeff 1 1 table benzene_water.pair.table PairBB
WARNING: 33 of 500 force values in table PairBB are inconsistent with -dE/dr.
WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465)
WARNING: 150 of 500 distance values in table PairBB with relative error
WARNING: over 1e-06 to re-computed values (src/pair_table.cpp:473)
pair_coeff 1 2 table benzene_water.pair.table PairWW
WARNING: 61 of 500 force values in table PairWW are inconsistent with -dE/dr.
WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465)
WARNING: 90 of 500 distance values in table PairWW with relative error
WARNING: over 1e-06 to re-computed values (src/pair_table.cpp:473)
pair_coeff 2 2 table benzene_water.pair.table PairBW
WARNING: 108 of 500 force values in table PairBW are inconsistent with -dE/dr.
WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465)
WARNING: 135 of 500 distance values in table PairBW with relative error
WARNING: over 1e-06 to re-computed values (src/pair_table.cpp:473)
pair_coeff * * local/density benzene_water.localdensity.table
# Recentering during minimization and equilibration
fix recentering all recenter 0.0 0.0 0.0 units box
# Thermostat & time integration
timestep 2.0
thermo 100
thermo_style custom temp ke pe etotal ebond eangle edihed evdwl
# Minimization
minimize 1.e-4 0.0 10000 10000
CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
Your simulation uses code contributions which should be cited:
- pair_style local/density command:
@Article{Sanyal16,
author = {T.Sanyal and M.Scott Shell},
title = {Coarse-grained models using local-density potentials optimized with the relative entropy: Application to implicit solvation},
journal = {J.~Chem.~Phys.},
year = 2016,
DOI = doi.org/10.1063/1.4958629}
@Article{Sanyal18,
author = {T.Sanyal and M.Scott Shell},
title = {Transferable coarse-grained models of liquid-liquid equilibrium using local density potentials optimized with the relative entropy},
journal = {J.~Phys.~Chem. B},
year = 2018,
DOI = doi.org/10.1021/acs.jpcb.7b12446}
CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (src/min.cpp:187)
generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
Neighbor list info ...
update every 1 steps, delay 0 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 15.25
ghost atom cutoff = 15.25
binsize = 7.625, bins = 4 4 18
2 neighbor lists, perpetual/occasional/extra = 2 0 0
(1) pair table, perpetual
attributes: half, newton on
pair build: half/bin/newton
stencil: half/bin/3d
bin: standard
(2) pair local/density, perpetual, copy from (1)
attributes: half, newton on
pair build: copy
stencil: none
bin: none
Per MPI rank memory allocation (min/avg/max) = 8.754 | 8.754 | 8.754 Mbytes
Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl
300 1233.1611 2374.6749 3607.836 0 0 0 2374.6749
300 1233.1611 985.54829 2218.7094 0 0 0 985.54829
300 1233.1611 962.66036 2195.8215 0 0 0 962.66036
Loop time of 0.812343 on 1 procs for 134 steps with 1380 atoms
99.8% CPU use with 1 MPI tasks x 1 OpenMP threads
Minimization stats:
Stopping criterion = energy tolerance
Energy initial, next-to-last, final =
2374.67491482358 962.664796664787 962.660357218268
Force two-norm initial, final = 263.77519 15.741017
Force max component initial, final = 22.412654 7.9360139
Final line search alpha, max atom move = 0.014975513 0.11884588
Iterations, force evaluations = 134 240
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.78539 | 0.78539 | 0.78539 | 0.0 | 96.68
Bond | 2.0149e-05 | 2.0149e-05 | 2.0149e-05 | 0.0 | 0.00
Neigh | 0.016759 | 0.016759 | 0.016759 | 0.0 | 2.06
Comm | 0.0045 | 0.0045 | 0.0045 | 0.0 | 0.55
Output | 2.9402e-05 | 2.9402e-05 | 2.9402e-05 | 0.0 | 0.00
Modify | 0 | 0 | 0 | 0.0 | 0.00
Other | | 0.005647 | | | 0.70
Nlocal: 1380.00 ave 1380 max 1380 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 5832.00 ave 5832 max 5832 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 78165.0 ave 78165 max 78165 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 78165
Ave neighs/atom = 56.641304
Ave special neighs/atom = 0.0000000
Neighbor list builds = 5
Dangerous builds = 0
# Set up integration parameters
fix timeintegration all nve
fix thermostat all langevin 3.0000e+02 3.0000e+02 1.0000e+02 81890
# Equilibration (for realistic results, run for 5000000 steps)
reset_timestep 0
run 5000
generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
WARNING: Fix recenter should come after all other integration fixes (src/fix_recenter.cpp:133)
Per MPI rank memory allocation (min/avg/max) = 7.629 | 7.629 | 7.629 Mbytes
Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl
300 1233.1611 962.66036 2195.8215 0 0 0 962.66036
253.1913 1040.7522 1803.711 2844.4633 0 0 0 1803.711
290.31049 1193.332 2059.0637 3252.3958 0 0 0 2059.0637
299.30778 1230.3157 2140.226 3370.5417 0 0 0 2140.226
309.81524 1273.507 2178.3782 3451.8853 0 0 0 2178.3782
299.79526 1232.3195 2229.9248 3462.2444 0 0 0 2229.9248
299.24909 1230.0745 2260.7129 3490.7874 0 0 0 2260.7129
299.5898 1231.475 2244.2384 3475.7134 0 0 0 2244.2384
297.81223 1224.1682 2320.27 3544.4382 0 0 0 2320.27
301.53975 1239.4903 2277.0431 3516.5334 0 0 0 2277.0431
292.00572 1200.3003 2292.3073 3492.6076 0 0 0 2292.3073
309.19709 1270.9661 2303.6055 3574.5716 0 0 0 2303.6055
297.54933 1223.0876 2304.127 3527.2146 0 0 0 2304.127
303.48106 1247.4702 2303.5673 3551.0375 0 0 0 2303.5673
296.46047 1218.6118 2256.1591 3474.7709 0 0 0 2256.1591
299.4835 1231.038 2280.0452 3511.0832 0 0 0 2280.0452
306.25958 1258.8914 2307.9795 3566.8709 0 0 0 2307.9795
304.67335 1252.3711 2284.8252 3537.1963 0 0 0 2284.8252
298.33637 1226.3227 2289.8499 3516.1726 0 0 0 2289.8499
303.1338 1246.0427 2342.2148 3588.2575 0 0 0 2342.2148
305.86051 1257.251 2341.0106 3598.2616 0 0 0 2341.0106
297.75418 1223.9296 2303.5613 3527.4909 0 0 0 2303.5613
296.79348 1219.9806 2327.5207 3547.5013 0 0 0 2327.5207
307.25403 1262.9791 2288.4219 3551.401 0 0 0 2288.4219
301.26976 1238.3805 2291.2465 3529.627 0 0 0 2291.2465
297.17249 1221.5385 2283.3926 3504.9311 0 0 0 2283.3926
313.99072 1290.6705 2293.9661 3584.6366 0 0 0 2293.9661
301.70804 1240.1821 2331.1694 3571.3515 0 0 0 2331.1694
300.62599 1235.7343 2325.4367 3561.171 0 0 0 2325.4367
292.13495 1200.8316 2315.631 3516.4626 0 0 0 2315.631
313.9981 1290.7008 2286.0536 3576.7545 0 0 0 2286.0536
300.25311 1234.2015 2324.2379 3558.4394 0 0 0 2324.2379
309.3746 1271.6958 2322.2298 3593.9256 0 0 0 2322.2298
300.23041 1234.1082 2332.7521 3566.8603 0 0 0 2332.7521
302.97054 1245.3716 2303.1689 3548.5405 0 0 0 2303.1689
294.77155 1211.6694 2334.5087 3546.1781 0 0 0 2334.5087
296.81476 1220.0681 2322.5932 3542.6613 0 0 0 2322.5932
301.83238 1240.6932 2345.4841 3586.1773 0 0 0 2345.4841
295.0399 1212.7724 2312.3889 3525.1614 0 0 0 2312.3889
300.73565 1236.185 2338.8384 3575.0235 0 0 0 2338.8384
303.02264 1245.5858 2310.0868 3555.6726 0 0 0 2310.0868
302.86404 1244.9339 2332.2001 3577.134 0 0 0 2332.2001
293.77916 1207.5901 2293.2799 3500.8701 0 0 0 2293.2799
299.30072 1230.2867 2317.5065 3547.7933 0 0 0 2317.5065
311.05029 1278.5837 2311.0476 3589.6313 0 0 0 2311.0476
293.25646 1205.4416 2314.7398 3520.1814 0 0 0 2314.7398
310.49018 1276.2814 2337.4909 3613.7723 0 0 0 2337.4909
302.37336 1242.9169 2340.3197 3583.2366 0 0 0 2340.3197
297.06862 1221.1116 2323.9136 3545.0252 0 0 0 2323.9136
300.54817 1235.4144 2315.2405 3550.6549 0 0 0 2315.2405
309.10643 1270.5934 2333.1848 3603.7783 0 0 0 2333.1848
Loop time of 15.2696 on 1 procs for 5000 steps with 1380 atoms
Performance: 56.583 ns/day, 0.424 hours/ns, 327.447 timesteps/s
99.9% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 14.432 | 14.432 | 14.432 | 0.0 | 94.51
Bond | 0.00032375 | 0.00032375 | 0.00032375 | 0.0 | 0.00
Neigh | 0.41541 | 0.41541 | 0.41541 | 0.0 | 2.72
Comm | 0.0975 | 0.0975 | 0.0975 | 0.0 | 0.64
Output | 0.0013044 | 0.0013044 | 0.0013044 | 0.0 | 0.01
Modify | 0.30336 | 0.30336 | 0.30336 | 0.0 | 1.99
Other | | 0.01973 | | | 0.13
Nlocal: 1380.00 ave 1380 max 1380 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 5843.00 ave 5843 max 5843 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 76949.0 ave 76949 max 76949 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 76949
Ave neighs/atom = 55.760145
Ave special neighs/atom = 0.0000000
Neighbor list builds = 121
Dangerous builds = 1
# Turn off recentering during production phase
unfix recentering
# Setup trajectory output
dump myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element
dump_modify myDump element B W
dump_modify myDump sort id
# Production (for realistic results, run for 10000000 steps)
reset_timestep 0
run 1000
generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
Per MPI rank memory allocation (min/avg/max) = 9.022 | 9.022 | 9.022 Mbytes
Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl
309.10643 1270.5934 2333.1848 3603.7783 0 0 0 2333.1848
300.84572 1236.6375 2331.3493 3567.9868 0 0 0 2331.3493
300.90599 1236.8852 2337.6775 3574.5627 0 0 0 2337.6775
302.77895 1244.5841 2341.7778 3586.362 0 0 0 2341.7778
291.66639 1198.9055 2320.3512 3519.2567 0 0 0 2320.3512
298.7003 1227.8187 2292.8195 3520.6382 0 0 0 2292.8195
301.11163 1237.7305 2310.017 3547.7475 0 0 0 2310.017
305.22515 1254.6393 2315.1355 3569.7748 0 0 0 2315.1355
295.15921 1213.2629 2310.184 3523.4468 0 0 0 2310.184
299.2024 1229.8826 2332.2118 3562.0943 0 0 0 2332.2118
302.80078 1244.6738 2320.3763 3565.0502 0 0 0 2320.3763
Loop time of 3.07208 on 1 procs for 1000 steps with 1380 atoms
Performance: 56.249 ns/day, 0.427 hours/ns, 325.512 timesteps/s
99.9% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 2.8993 | 2.8993 | 2.8993 | 0.0 | 94.37
Bond | 6.5327e-05 | 6.5327e-05 | 6.5327e-05 | 0.0 | 0.00
Neigh | 0.083502 | 0.083502 | 0.083502 | 0.0 | 2.72
Comm | 0.019967 | 0.019967 | 0.019967 | 0.0 | 0.65
Output | 0.012268 | 0.012268 | 0.012268 | 0.0 | 0.40
Modify | 0.052801 | 0.052801 | 0.052801 | 0.0 | 1.72
Other | | 0.004203 | | | 0.14
Nlocal: 1380.00 ave 1380 max 1380 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 5860.00 ave 5860 max 5860 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 77055.0 ave 77055 max 77055 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 77055
Ave neighs/atom = 55.836957
Ave special neighs/atom = 0.0000000
Neighbor list builds = 24
Dangerous builds = 0
Total wall time: 0:00:19

View File

@ -0,0 +1,299 @@
LAMMPS (27 Oct 2021)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
using 1 OpenMP thread(s) per MPI task
# LAMMPS input file for 26.5% benzene mole fraction solution
# with 380 benzene and 1000 water molecules,
# using all possible local density potentials
# between benzene and water
#
# Author: Tanmoy Sanyal, Shell Group, UC Santa Barbara
#
# Refer: Sanyal and Shell, JPC-B, 2018, 122 (21), 5678-5693
# Initialize simulation box
dimension 3
boundary p p p
units real
atom_style molecular
# Set potential styles
pair_style hybrid/overlay table spline 500 local/density
# Read molecule data and set initial velocities
read_data benzene_water.data
Reading data file ...
orthogonal box = (-12.865000 -12.865000 -64.829000) to (12.865000 12.865000 64.829000)
1 by 1 by 4 MPI processor grid
reading atoms ...
1380 atoms
Finding 1-2 1-3 1-4 neighbors ...
special bond factors lj: 0 0 0
special bond factors coul: 0 0 0
0 = max # of 1-2 neighbors
0 = max # of 1-3 neighbors
0 = max # of 1-4 neighbors
1 = max # of special neighbors
special bonds CPU = 0.000 seconds
read_data CPU = 0.007 seconds
velocity all create 3.0000e+02 16611 rot yes dist gaussian
# Assign potentials
pair_coeff 1 1 table benzene_water.pair.table PairBB
WARNING: 33 of 500 force values in table PairBB are inconsistent with -dE/dr.
WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465)
WARNING: 150 of 500 distance values in table PairBB with relative error
WARNING: over 1e-06 to re-computed values (src/pair_table.cpp:473)
pair_coeff 1 2 table benzene_water.pair.table PairWW
WARNING: 61 of 500 force values in table PairWW are inconsistent with -dE/dr.
WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465)
WARNING: 90 of 500 distance values in table PairWW with relative error
WARNING: over 1e-06 to re-computed values (src/pair_table.cpp:473)
pair_coeff 2 2 table benzene_water.pair.table PairBW
WARNING: 108 of 500 force values in table PairBW are inconsistent with -dE/dr.
WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465)
WARNING: 135 of 500 distance values in table PairBW with relative error
WARNING: over 1e-06 to re-computed values (src/pair_table.cpp:473)
pair_coeff * * local/density benzene_water.localdensity.table
# Recentering during minimization and equilibration
fix recentering all recenter 0.0 0.0 0.0 units box
# Thermostat & time integration
timestep 2.0
thermo 100
thermo_style custom temp ke pe etotal ebond eangle edihed evdwl
# Minimization
minimize 1.e-4 0.0 10000 10000
CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
Your simulation uses code contributions which should be cited:
- pair_style local/density command:
@Article{Sanyal16,
author = {T.Sanyal and M.Scott Shell},
title = {Coarse-grained models using local-density potentials optimized with the relative entropy: Application to implicit solvation},
journal = {J.~Chem.~Phys.},
year = 2016,
DOI = doi.org/10.1063/1.4958629}
@Article{Sanyal18,
author = {T.Sanyal and M.Scott Shell},
title = {Transferable coarse-grained models of liquid-liquid equilibrium using local density potentials optimized with the relative entropy},
journal = {J.~Phys.~Chem. B},
year = 2018,
DOI = doi.org/10.1021/acs.jpcb.7b12446}
CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (src/min.cpp:187)
generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
Neighbor list info ...
update every 1 steps, delay 0 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 15.25
ghost atom cutoff = 15.25
binsize = 7.625, bins = 4 4 18
2 neighbor lists, perpetual/occasional/extra = 2 0 0
(1) pair table, perpetual
attributes: half, newton on
pair build: half/bin/newton
stencil: half/bin/3d
bin: standard
(2) pair local/density, perpetual, copy from (1)
attributes: half, newton on
pair build: copy
stencil: none
bin: none
Per MPI rank memory allocation (min/avg/max) = 8.441 | 8.589 | 8.688 Mbytes
Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl
300 1233.1611 2374.6749 3607.836 0 0 0 2374.6749
300 1233.1611 1024.8113 2257.9724 0 0 0 1024.8113
Loop time of 0.240559 on 4 procs for 74 steps with 1380 atoms
98.5% CPU use with 4 MPI tasks x 1 OpenMP threads
Minimization stats:
Stopping criterion = energy tolerance
Energy initial, next-to-last, final =
2374.67491482358 1024.89407898645 1024.81130011575
Force two-norm initial, final = 263.77519 20.459697
Force max component initial, final = 22.412654 8.6082349
Final line search alpha, max atom move = 0.027790997 0.23923143
Iterations, force evaluations = 74 118
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.15928 | 0.1873 | 0.22814 | 6.5 | 77.86
Bond | 3.857e-06 | 4.4012e-06 | 5.496e-06 | 0.0 | 0.00
Neigh | 0.00064142 | 0.0028761 | 0.0058864 | 4.2 | 1.20
Comm | 0.0040776 | 0.039595 | 0.074187 | 12.6 | 16.46
Output | 0 | 0 | 0 | 0.0 | 0.00
Modify | 0 | 0 | 0 | 0.0 | 0.00
Other | | 0.01078 | | | 4.48
Nlocal: 345.000 ave 664 max 147 min
Histogram: 2 0 0 0 0 1 0 0 0 1
Nghost: 2850.50 ave 4438 max 1208 min
Histogram: 1 0 0 1 0 0 1 0 0 1
Neighs: 19377.5 ave 37718 max 7456 min
Histogram: 2 0 0 0 0 1 0 0 0 1
Total # of neighbors = 77510
Ave neighs/atom = 56.166667
Ave special neighs/atom = 0.0000000
Neighbor list builds = 3
Dangerous builds = 0
# Set up integration parameters
fix timeintegration all nve
fix thermostat all langevin 3.0000e+02 3.0000e+02 1.0000e+02 81890
# Equilibration (for realistic results, run for 5000000 steps)
reset_timestep 0
run 5000
generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
WARNING: Fix recenter should come after all other integration fixes (src/fix_recenter.cpp:133)
Per MPI rank memory allocation (min/avg/max) = 7.316 | 7.465 | 7.563 Mbytes
Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl
300 1233.1611 1024.8113 2257.9724 0 0 0 1024.8113
263.61917 1083.6164 1866.745 2950.3614 0 0 0 1866.745
296.0253 1216.823 2122.8463 3339.6692 0 0 0 2122.8463
301.93846 1241.1292 2172.9802 3414.1095 0 0 0 2172.9802
293.9491 1208.2887 2205.4892 3413.7779 0 0 0 2205.4892
286.33795 1177.0027 2204.8908 3381.8935 0 0 0 2204.8908
295.48217 1214.5904 2230.8849 3445.4753 0 0 0 2230.8849
293.88908 1208.0419 2218.7563 3426.7982 0 0 0 2218.7563
295.13798 1213.1756 2277.4515 3490.6271 0 0 0 2277.4515
290.39538 1193.681 2273.4385 3467.1195 0 0 0 2273.4385
297.56782 1223.1635 2268.7182 3491.8817 0 0 0 2268.7182
306.45578 1259.6978 2289.1507 3548.8486 0 0 0 2289.1507
308.54582 1268.289 2284.8514 3553.1404 0 0 0 2284.8514
302.17353 1242.0955 2262.5577 3504.6532 0 0 0 2262.5577
295.30087 1213.8452 2315.8853 3529.7305 0 0 0 2315.8853
308.59197 1268.4787 2291.8314 3560.3101 0 0 0 2291.8314
297.75618 1223.9378 2287.2003 3511.1381 0 0 0 2287.2003
303.43395 1247.2765 2297.7158 3544.9923 0 0 0 2297.7158
307.16233 1262.6021 2255.9769 3518.5791 0 0 0 2255.9769
301.34428 1238.6868 2284.416 3523.1028 0 0 0 2284.416
295.43209 1214.3846 2294.1043 3508.4889 0 0 0 2294.1043
287.86904 1183.2963 2257.0204 3440.3168 0 0 0 2257.0204
297.2661 1221.9233 2251.4194 3473.3428 0 0 0 2251.4194
298.90221 1228.6486 2261.834 3490.4826 0 0 0 2261.834
288.07202 1184.1307 2284.1918 3468.3225 0 0 0 2284.1918
300.41201 1234.8547 2303.9573 3538.812 0 0 0 2303.9573
283.91279 1167.034 2329.7936 3496.8277 0 0 0 2329.7936
297.27507 1221.9602 2337.0516 3559.0118 0 0 0 2337.0516
296.22263 1217.6341 2335.6424 3553.2765 0 0 0 2335.6424
296.13784 1217.2856 2364.7034 3581.989 0 0 0 2364.7034
308.17642 1266.7706 2320.2753 3587.0459 0 0 0 2320.2753
310.26592 1275.3596 2301.9318 3577.2914 0 0 0 2301.9318
292.97391 1204.2801 2289.8116 3494.0917 0 0 0 2289.8116
294.81231 1211.8369 2315.0388 3526.8757 0 0 0 2315.0388
298.66155 1227.6594 2317.2844 3544.9437 0 0 0 2317.2844
302.77939 1244.5859 2301.2063 3545.7922 0 0 0 2301.2063
291.47597 1198.1228 2285.1757 3483.2985 0 0 0 2285.1757
286.19045 1176.3964 2265.2665 3441.6629 0 0 0 2265.2665
295.58144 1214.9984 2272.3165 3487.315 0 0 0 2272.3165
283.86988 1166.8577 2320.6142 3487.4719 0 0 0 2320.6142
300.0576 1233.3979 2330.8962 3564.2941 0 0 0 2330.8962
299.86413 1232.6026 2321.2281 3553.8308 0 0 0 2321.2281
292.79017 1203.5248 2334.2308 3537.7557 0 0 0 2334.2308
291.5027 1198.2327 2335.2119 3533.4446 0 0 0 2335.2119
299.55471 1231.3307 2332.5216 3563.8524 0 0 0 2332.5216
293.29613 1205.6046 2295.3263 3500.9309 0 0 0 2295.3263
303.13151 1246.0333 2310.0548 3556.0881 0 0 0 2310.0548
298.83954 1228.391 2297.3117 3525.7027 0 0 0 2297.3117
297.44775 1222.67 2307.2483 3529.9183 0 0 0 2307.2483
309.59874 1272.6171 2309.2439 3581.861 0 0 0 2309.2439
307.47844 1263.9015 2274.998 3538.8995 0 0 0 2274.998
Loop time of 11.2235 on 4 procs for 5000 steps with 1380 atoms
Performance: 76.982 ns/day, 0.312 hours/ns, 445.495 timesteps/s
98.5% CPU use with 4 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 7.1444 | 8.5074 | 10.534 | 44.9 | 75.80
Bond | 0.00017048 | 0.00020672 | 0.00030488 | 0.0 | 0.00
Neigh | 0.026174 | 0.12108 | 0.26052 | 28.2 | 1.08
Comm | 0.21788 | 1.8597 | 3.3375 | 81.2 | 16.57
Output | 0.0008989 | 0.0069895 | 0.021647 | 10.2 | 0.06
Modify | 0.19418 | 0.7044 | 2.1378 | 98.6 | 6.28
Other | | 0.02368 | | | 0.21
Nlocal: 345.000 ave 678 max 148 min
Histogram: 2 0 0 0 1 0 0 0 0 1
Nghost: 2854.25 ave 4464 max 1181 min
Histogram: 1 0 0 1 0 0 1 0 0 1
Neighs: 19366.8 ave 38533 max 7481 min
Histogram: 2 0 0 0 0 1 0 0 0 1
Total # of neighbors = 77467
Ave neighs/atom = 56.135507
Ave special neighs/atom = 0.0000000
Neighbor list builds = 121
Dangerous builds = 1
# Turn off recentering during production phase
unfix recentering
# Setup trajectory output
dump myDump all custom 100 benzene_water.lammpstrj.gz id type x y z element
dump_modify myDump element B W
dump_modify myDump sort id
# Production (for realistic results, run for 10000000 steps)
reset_timestep 0
run 1000
generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
Per MPI rank memory allocation (min/avg/max) = 8.640 | 8.791 | 8.894 Mbytes
Temp KinEng PotEng TotEng E_bond E_angle E_dihed E_vdwl
307.47844 1263.9015 2274.998 3538.8995 0 0 0 2274.998
309.46142 1272.0526 2274.8499 3546.9026 0 0 0 2274.8499
300.70977 1236.0787 2301.0588 3537.1374 0 0 0 2301.0588
300.53659 1235.3668 2316.1008 3551.4675 0 0 0 2316.1008
300.48582 1235.1581 2296.3009 3531.459 0 0 0 2296.3009
299.2618 1230.1267 2325.7501 3555.8768 0 0 0 2325.7501
303.00905 1245.5299 2321.8238 3567.3537 0 0 0 2321.8238
300.07018 1233.4496 2339.2833 3572.7329 0 0 0 2339.2833
304.20292 1250.4374 2353.1018 3603.5392 0 0 0 2353.1018
304.19487 1250.4043 2334.5087 3584.913 0 0 0 2334.5087
294.24283 1209.4961 2335.0535 3544.5496 0 0 0 2335.0535
Loop time of 2.90512 on 4 procs for 1000 steps with 1380 atoms
Performance: 59.481 ns/day, 0.403 hours/ns, 344.220 timesteps/s
98.4% CPU use with 4 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 1.8627 | 2.2082 | 2.7289 | 22.6 | 76.01
Bond | 4.042e-05 | 5.3677e-05 | 8.4044e-05 | 0.0 | 0.00
Neigh | 0.0066184 | 0.030172 | 0.064523 | 13.9 | 1.04
Comm | 0.05914 | 0.51145 | 0.86887 | 40.7 | 17.61
Output | 0.0057814 | 0.0073478 | 0.011158 | 2.6 | 0.25
Modify | 0.0085337 | 0.020869 | 0.042248 | 9.4 | 0.72
Other | | 0.127 | | | 4.37
Nlocal: 345.000 ave 682 max 147 min
Histogram: 2 0 0 0 1 0 0 0 0 1
Nghost: 2836.25 ave 4427 max 1175 min
Histogram: 1 0 0 1 0 0 1 0 0 1
Neighs: 19249.8 ave 38683 max 7433 min
Histogram: 2 0 0 0 1 0 0 0 0 1
Total # of neighbors = 76999
Ave neighs/atom = 55.796377
Ave special neighs/atom = 0.0000000
Neighbor list builds = 23
Dangerous builds = 0
Total wall time: 0:00:14

View File

@ -1,226 +0,0 @@
LAMMPS (7 Aug 2019)
# LAMMPS input file for 50.0% methanol mole fraction solution
# with 2500 methanol molecules in implicit water.
#
#
# Author: David Rosenberger, van der Vegt Group, TU Darmstadt
#
# Refer: Rosenberger, Sanyal, Shell, van der Vegt, J. Chem. Theory Comput. 15, 2881-2895 (2019)
# Initialize simulation box
dimension 3
boundary p p p
units real
atom_style molecular
# Set potential styles
pair_style hybrid/overlay table spline 500 local/density
# Read molecule data and set initial velocities
read_data methanol_implicit_water.data
orthogonal box = (-31.123 -31.123 -31.123) to (31.123 31.123 31.123)
2 by 2 by 2 MPI processor grid
reading atoms ...
2500 atoms
0 = max # of 1-2 neighbors
0 = max # of 1-3 neighbors
0 = max # of 1-4 neighbors
1 = max # of special neighbors
special bonds CPU = 0.00063014 secs
read_data CPU = 0.00599909 secs
velocity all create 3.0000e+02 12142 rot yes dist gaussian
# Assign potentials
pair_coeff 1 1 table methanol_implicit_water.pair.table PairMM
WARNING: 93 of 500 force values in table are inconsistent with -dE/dr.
Should only be flagged at inflection points (../pair_table.cpp:483)
WARNING: 254 of 500 distance values in table with relative error
over 1e-06 to re-computed values (../pair_table.cpp:492)
pair_coeff * * local/density methanol_implicit_water.localdensity.table
#Recentering during minimization and equilibration
fix recentering all recenter 0.0 0.0 0.0 units box
#Thermostat & time integration
timestep 1.0
thermo 100
thermo_style custom etotal ke pe temp evdwl
#minimization
minimize 1.e-4 0.0 1000 1000
WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (../min.cpp:168)
Neighbor list info ...
update every 1 steps, delay 0 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 17
ghost atom cutoff = 17
binsize = 8.5, bins = 8 8 8
2 neighbor lists, perpetual/occasional/extra = 2 0 0
(1) pair table, perpetual
attributes: half, newton on
pair build: half/bin/newton
stencil: half/bin/3d/newton
bin: standard
(2) pair local/density, perpetual, copy from (1)
attributes: half, newton on
pair build: copy
stencil: none
bin: none
Per MPI rank memory allocation (min/avg/max) = 7.411 | 7.411 | 7.412 Mbytes
TotEng KinEng PotEng Temp E_vdwl
1470.3564 2234.7133 -764.35689 300 -764.35689
46.496766 2234.7133 -2188.2165 300 -2188.2165
7.9030246 2234.7133 -2226.8103 300 -2226.8103
Loop time of 0.463996 on 8 procs for 121 steps with 2500 atoms
91.4% CPU use with 8 MPI tasks x no OpenMP threads
Minimization stats:
Stopping criterion = linesearch alpha is zero
Energy initial, next-to-last, final =
-764.356892369 -2227.85589084 -2226.81026984
Force two-norm initial, final = 134.911 3.83896
Force max component initial, final = 14.1117 1.07422
Final line search alpha, max atom move = 5.06747e-10 5.44356e-10
Iterations, force evaluations = 121 154
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.41442 | 0.41976 | 0.42434 | 0.5 | 90.47
Bond | 1.1683e-05 | 2.0713e-05 | 3.5048e-05 | 0.0 | 0.00
Neigh | 0.0084722 | 0.0090862 | 0.010038 | 0.5 | 1.96
Comm | 0.022712 | 0.028157 | 0.034072 | 1.9 | 6.07
Output | 3.1948e-05 | 3.6925e-05 | 6.6996e-05 | 0.0 | 0.01
Modify | 0 | 0 | 0 | 0.0 | 0.00
Other | | 0.006937 | | | 1.50
Nlocal: 312.5 ave 333 max 299 min
Histogram: 2 2 0 0 1 0 2 0 0 1
Nghost: 2546 ave 2580 max 2517 min
Histogram: 1 1 0 3 0 1 0 0 0 2
Neighs: 33215.4 ave 37251 max 29183 min
Histogram: 1 0 0 1 2 2 0 1 0 1
Total # of neighbors = 265723
Ave neighs/atom = 106.289
Ave special neighs/atom = 0
Neighbor list builds = 6
Dangerous builds = 0
#set up integration parameters
fix timeintegration all nve
fix thermostat all langevin 3.0000e+02 3.0000e+02 1.0000e+02 59915
#Equilibration (for realistic results, run for 2000000 steps)
reset_timestep 0
thermo 200
thermo_style custom etotal ke pe temp evdwl
#run equilibration
run 2000
WARNING: Fix recenter should come after all other integration fixes (../fix_recenter.cpp:131)
Per MPI rank memory allocation (min/avg/max) = 6.286 | 6.286 | 6.287 Mbytes
TotEng KinEng PotEng Temp E_vdwl
177.26822 2234.7133 -2057.4451 300 -2057.4451
736.24287 2151.2608 -1415.0179 288.79688 -1415.0179
963.07617 2090.6433 -1127.5671 280.65926 -1127.5671
1148.9049 2173.1327 -1024.2279 291.73309 -1024.2279
1303.6409 2279.8586 -976.21767 306.06055 -976.21767
1355.42 2281.0383 -925.61826 306.21892 -925.61826
1394.5206 2276.2093 -881.68863 305.57064 -881.68863
1346.9764 2215.2973 -868.32091 297.3935 -868.32091
1381.3654 2248.8061 -867.44063 301.89189 -867.44063
1315.8059 2189.3193 -873.51332 293.90606 -873.51332
1314.4456 2209.7431 -895.29752 296.64787 -895.29752
Loop time of 6.38989 on 8 procs for 2000 steps with 2500 atoms
Performance: 27.043 ns/day, 0.887 hours/ns, 312.994 timesteps/s
80.5% CPU use with 8 MPI tasks x no OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 5.2693 | 5.3572 | 5.457 | 2.1 | 83.84
Bond | 0.00028825 | 0.00033835 | 0.00039148 | 0.0 | 0.01
Neigh | 0.0296 | 0.032337 | 0.035071 | 0.9 | 0.51
Comm | 0.64679 | 0.73397 | 0.80847 | 5.2 | 11.49
Output | 0.00033498 | 0.00051582 | 0.0015228 | 0.0 | 0.01
Modify | 0.16395 | 0.18919 | 0.21056 | 3.9 | 2.96
Other | | 0.07636 | | | 1.19
Nlocal: 312.5 ave 337 max 295 min
Histogram: 2 2 0 1 0 0 0 1 1 1
Nghost: 2551.62 ave 2582 max 2525 min
Histogram: 2 1 0 0 1 1 1 0 1 1
Neighs: 33241.8 ave 37659 max 29705 min
Histogram: 2 0 0 2 2 0 0 0 1 1
Total # of neighbors = 265934
Ave neighs/atom = 106.374
Ave special neighs/atom = 0
Neighbor list builds = 21
Dangerous builds = 0
#turn off recentering during production run
unfix recentering
#setup trajectory output
dump myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element
dump_modify myDump element M
dump_modify myDump sort id
#run production (for realistic results, run for 10000000 steps)
reset_timestep 0
thermo 1000
thermo_style custom etotal ke pe temp evdwl
run 10000
Per MPI rank memory allocation (min/avg/max) = 7.588 | 7.589 | 7.589 Mbytes
TotEng KinEng PotEng Temp E_vdwl
1442.5428 2209.7431 -767.20027 296.64787 -767.20027
1391.8624 2262.6889 -870.82656 303.7556 -870.82656
1375.914 2244.6176 -868.7036 301.3296 -868.7036
1345.9064 2227.2324 -881.32599 298.99573 -881.32599
1379.2334 2278.1156 -898.88222 305.82657 -898.88222
1389.7928 2255.8062 -866.01341 302.83163 -866.01341
1380.4549 2258.2108 -877.75582 303.15443 -877.75582
1380.8489 2256.9432 -876.09428 302.98426 -876.09428
1326.5151 2225.7408 -899.22577 298.79549 -899.22577
1376.6025 2253.0128 -876.41028 302.45662 -876.41028
1331.0008 2218.1033 -887.10258 297.77019 -887.10258
Loop time of 25.4591 on 8 procs for 10000 steps with 2500 atoms
Performance: 33.937 ns/day, 0.707 hours/ns, 392.787 timesteps/s
89.3% CPU use with 8 MPI tasks x no OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 21.635 | 21.916 | 22.237 | 3.9 | 86.08
Bond | 0.0011308 | 0.0013149 | 0.0016932 | 0.5 | 0.01
Neigh | 0.14593 | 0.15675 | 0.16667 | 1.9 | 0.62
Comm | 1.3789 | 1.7502 | 1.9558 | 13.7 | 6.87
Output | 0.34664 | 0.82927 | 1.2013 | 32.8 | 3.26
Modify | 0.24904 | 0.25842 | 0.26907 | 1.2 | 1.02
Other | | 0.5475 | | | 2.15
Nlocal: 312.5 ave 327 max 298 min
Histogram: 2 0 0 1 1 0 1 1 1 1
Nghost: 2575 ave 2601 max 2559 min
Histogram: 2 0 3 1 0 0 0 0 1 1
Neighs: 33223.2 ave 35920 max 30303 min
Histogram: 1 1 1 1 0 1 0 0 0 3
Total # of neighbors = 265786
Ave neighs/atom = 106.314
Ave special neighs/atom = 0
Neighbor list builds = 103
Dangerous builds = 0
Total wall time: 0:00:32

View File

@ -0,0 +1,259 @@
LAMMPS (27 Oct 2021)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
using 1 OpenMP thread(s) per MPI task
# LAMMPS input file for 50.0% methanol mole fraction solution
# with 2500 methanol molecules in implicit water.
#
#
# Author: David Rosenberger, van der Vegt Group, TU Darmstadt
#
# Refer: Rosenberger, Sanyal, Shell, van der Vegt, J. Chem. Theory Comput. 15, 2881-2895 (2019)
# Initialize simulation box
dimension 3
boundary p p p
units real
atom_style molecular
# Set potential styles
pair_style hybrid/overlay table spline 500 local/density
# Read molecule data and set initial velocities
read_data methanol_implicit_water.data
Reading data file ...
orthogonal box = (-31.123000 -31.123000 -31.123000) to (31.123000 31.123000 31.123000)
1 by 1 by 1 MPI processor grid
reading atoms ...
2500 atoms
Finding 1-2 1-3 1-4 neighbors ...
special bond factors lj: 0 0 0
special bond factors coul: 0 0 0
0 = max # of 1-2 neighbors
0 = max # of 1-3 neighbors
0 = max # of 1-4 neighbors
1 = max # of special neighbors
special bonds CPU = 0.001 seconds
read_data CPU = 0.016 seconds
velocity all create 3.0000e+02 12142 rot yes dist gaussian
# Assign potentials
pair_coeff 1 1 table methanol_implicit_water.pair.table PairMM
WARNING: 93 of 500 force values in table PairMM are inconsistent with -dE/dr.
WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465)
WARNING: 254 of 500 distance values in table PairMM with relative error
WARNING: over 1e-06 to re-computed values (src/pair_table.cpp:473)
pair_coeff * * local/density methanol_implicit_water.localdensity.table
#Recentering during minimization and equilibration
fix recentering all recenter 0.0 0.0 0.0 units box
#Thermostat & time integration
timestep 1.0
thermo 100
thermo_style custom etotal ke pe temp evdwl
#minimization
minimize 1.e-4 0.0 1000 1000
CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
Your simulation uses code contributions which should be cited:
- pair_style local/density command:
@Article{Sanyal16,
author = {T.Sanyal and M.Scott Shell},
title = {Coarse-grained models using local-density potentials optimized with the relative entropy: Application to implicit solvation},
journal = {J.~Chem.~Phys.},
year = 2016,
DOI = doi.org/10.1063/1.4958629}
@Article{Sanyal18,
author = {T.Sanyal and M.Scott Shell},
title = {Transferable coarse-grained models of liquid-liquid equilibrium using local density potentials optimized with the relative entropy},
journal = {J.~Phys.~Chem. B},
year = 2018,
DOI = doi.org/10.1021/acs.jpcb.7b12446}
CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (src/min.cpp:187)
generated 0 of 0 mixed pair_coeff terms from geometric mixing rule
Neighbor list info ...
update every 1 steps, delay 0 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 17
ghost atom cutoff = 17
binsize = 8.5, bins = 8 8 8
2 neighbor lists, perpetual/occasional/extra = 2 0 0
(1) pair table, perpetual
attributes: half, newton on
pair build: half/bin/newton
stencil: half/bin/3d
bin: standard
(2) pair local/density, perpetual, copy from (1)
attributes: half, newton on
pair build: copy
stencil: none
bin: none
Per MPI rank memory allocation (min/avg/max) = 9.535 | 9.535 | 9.535 Mbytes
TotEng KinEng PotEng Temp E_vdwl
1283.8556 2234.7133 -950.85771 300 -950.85771
-10.187232 2234.7133 -2244.9005 300 -2244.9005
-124.79406 2234.7133 -2359.5074 300 -2359.5074
-126.7619 2234.7133 -2361.4752 300 -2361.4752
Loop time of 3.74581 on 1 procs for 205 steps with 2500 atoms
99.5% CPU use with 1 MPI tasks x 1 OpenMP threads
Minimization stats:
Stopping criterion = energy tolerance
Energy initial, next-to-last, final =
-950.857712502514 -2361.24417962983 -2361.47519428972
Force two-norm initial, final = 135.25170 2.8038329
Force max component initial, final = 14.083102 1.1154133
Final line search alpha, max atom move = 0.16981022 0.18940857
Iterations, force evaluations = 205 223
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 3.5678 | 3.5678 | 3.5678 | 0.0 | 95.25
Bond | 7.5831e-05 | 7.5831e-05 | 7.5831e-05 | 0.0 | 0.00
Neigh | 0.12962 | 0.12962 | 0.12962 | 0.0 | 3.46
Comm | 0.019204 | 0.019204 | 0.019204 | 0.0 | 0.51
Output | 0.00023948 | 0.00023948 | 0.00023948 | 0.0 | 0.01
Modify | 0 | 0 | 0 | 0.0 | 0.00
Other | | 0.02886 | | | 0.77
Nlocal: 2500.00 ave 2500 max 2500 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 6729.00 ave 6729 max 6729 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 265637.0 ave 265637 max 265637 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 265637
Ave neighs/atom = 106.25480
Ave special neighs/atom = 0.0000000
Neighbor list builds = 11
Dangerous builds = 0
#set up integration parameters
fix timeintegration all nve
fix thermostat all langevin 3.0000e+02 3.0000e+02 1.0000e+02 59915
#Equilibration (for realistic results, run for 2000000 steps)
reset_timestep 0
thermo 200
thermo_style custom etotal ke pe temp evdwl
#run equilibration
run 2000
generated 0 of 0 mixed pair_coeff terms from geometric mixing rule
WARNING: Fix recenter should come after all other integration fixes (src/fix_recenter.cpp:133)
Per MPI rank memory allocation (min/avg/max) = 8.410 | 8.410 | 8.410 Mbytes
TotEng KinEng PotEng Temp E_vdwl
-126.7619 2234.7133 -2361.4752 300 -2361.4752
517.05047 2015.8636 -1498.8131 270.62043 -1498.8131
931.78263 2135.4332 -1203.6506 286.6721 -1203.6506
1162.6209 2242.1662 -1079.5453 301.00051 -1079.5453
1164.2129 2211.6204 -1047.4075 296.89989 -1047.4075
1258.0085 2286.5942 -1028.5857 306.96477 -1028.5857
1231.1937 2200.814 -969.62032 295.44917 -969.62032
1251.2144 2245.0533 -993.83885 301.3881 -993.83885
1237.2495 2239.8802 -1002.6307 300.69363 -1002.6307
1232.3342 2224.3415 -992.00722 298.60763 -992.00722
1235.3228 2197.191 -961.86817 294.9628 -961.86817
Loop time of 23.6478 on 1 procs for 2000 steps with 2500 atoms
Performance: 7.307 ns/day, 3.284 hours/ns, 84.575 timesteps/s
99.5% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 22.797 | 22.797 | 22.797 | 0.0 | 96.40
Bond | 0.00070412 | 0.00070412 | 0.00070412 | 0.0 | 0.00
Neigh | 0.2249 | 0.2249 | 0.2249 | 0.0 | 0.95
Comm | 0.12259 | 0.12259 | 0.12259 | 0.0 | 0.52
Output | 0.00088925 | 0.00088925 | 0.00088925 | 0.0 | 0.00
Modify | 0.46447 | 0.46447 | 0.46447 | 0.0 | 1.96
Other | | 0.03711 | | | 0.16
Nlocal: 2500.00 ave 2500 max 2500 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 6752.00 ave 6752 max 6752 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 265940.0 ave 265940 max 265940 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 265940
Ave neighs/atom = 106.37600
Ave special neighs/atom = 0.0000000
Neighbor list builds = 20
Dangerous builds = 0
#turn off recentering during production run
unfix recentering
#setup trajectory output
dump myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element
dump_modify myDump element M
dump_modify myDump sort id
#run production (for realistic results, run for 10000000 steps)
reset_timestep 0
thermo 1000
thermo_style custom etotal ke pe temp evdwl
run 10000
generated 0 of 0 mixed pair_coeff terms from geometric mixing rule
Per MPI rank memory allocation (min/avg/max) = 9.918 | 9.918 | 9.918 Mbytes
TotEng KinEng PotEng Temp E_vdwl
1235.3228 2197.191 -961.86817 294.9628 -961.86817
1289.8463 2236.1425 -946.29622 300.19186 -946.29622
1348.0825 2305.0295 -956.94703 309.43963 -956.94703
1279.5478 2241.1582 -961.61041 300.86521 -961.61041
1231.8597 2201.9591 -970.09949 295.60291 -970.09949
1277.3424 2221.3696 -944.02725 298.20867 -944.02725
1296.0116 2222.0998 -926.08818 298.3067 -926.08818
1266.2849 2206.3727 -940.08782 296.1954 -940.08782
1313.2808 2260.5077 -947.22683 303.46278 -947.22683
1309.3076 2234.3895 -925.08198 299.95654 -925.08198
1275.9792 2221.3037 -945.32449 298.19982 -945.32449
Loop time of 67.3224 on 1 procs for 10000 steps with 2500 atoms
Performance: 12.834 ns/day, 1.870 hours/ns, 148.539 timesteps/s
99.4% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 64.476 | 64.476 | 64.476 | 0.0 | 95.77
Bond | 0.0014504 | 0.0014504 | 0.0014504 | 0.0 | 0.00
Neigh | 0.71333 | 0.71333 | 0.71333 | 0.0 | 1.06
Comm | 0.32846 | 0.32846 | 0.32846 | 0.0 | 0.49
Output | 0.46997 | 0.46997 | 0.46997 | 0.0 | 0.70
Modify | 1.2336 | 1.2336 | 1.2336 | 0.0 | 1.83
Other | | 0.09996 | | | 0.15
Nlocal: 2500.00 ave 2500 max 2500 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 6662.00 ave 6662 max 6662 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 265774.0 ave 265774 max 265774 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 265774
Ave neighs/atom = 106.30960
Ave special neighs/atom = 0.0000000
Neighbor list builds = 104
Dangerous builds = 0
Total wall time: 0:01:34

View File

@ -0,0 +1,259 @@
LAMMPS (27 Oct 2021)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:98)
using 1 OpenMP thread(s) per MPI task
# LAMMPS input file for 50.0% methanol mole fraction solution
# with 2500 methanol molecules in implicit water.
#
#
# Author: David Rosenberger, van der Vegt Group, TU Darmstadt
#
# Refer: Rosenberger, Sanyal, Shell, van der Vegt, J. Chem. Theory Comput. 15, 2881-2895 (2019)
# Initialize simulation box
dimension 3
boundary p p p
units real
atom_style molecular
# Set potential styles
pair_style hybrid/overlay table spline 500 local/density
# Read molecule data and set initial velocities
read_data methanol_implicit_water.data
Reading data file ...
orthogonal box = (-31.123000 -31.123000 -31.123000) to (31.123000 31.123000 31.123000)
1 by 2 by 2 MPI processor grid
reading atoms ...
2500 atoms
Finding 1-2 1-3 1-4 neighbors ...
special bond factors lj: 0 0 0
special bond factors coul: 0 0 0
0 = max # of 1-2 neighbors
0 = max # of 1-3 neighbors
0 = max # of 1-4 neighbors
1 = max # of special neighbors
special bonds CPU = 0.000 seconds
read_data CPU = 0.005 seconds
velocity all create 3.0000e+02 12142 rot yes dist gaussian
# Assign potentials
pair_coeff 1 1 table methanol_implicit_water.pair.table PairMM
WARNING: 93 of 500 force values in table PairMM are inconsistent with -dE/dr.
WARNING: Should only be flagged at inflection points (src/pair_table.cpp:465)
WARNING: 254 of 500 distance values in table PairMM with relative error
WARNING: over 1e-06 to re-computed values (src/pair_table.cpp:473)
pair_coeff * * local/density methanol_implicit_water.localdensity.table
#Recentering during minimization and equilibration
fix recentering all recenter 0.0 0.0 0.0 units box
#Thermostat & time integration
timestep 1.0
thermo 100
thermo_style custom etotal ke pe temp evdwl
#minimization
minimize 1.e-4 0.0 1000 1000
CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
Your simulation uses code contributions which should be cited:
- pair_style local/density command:
@Article{Sanyal16,
author = {T.Sanyal and M.Scott Shell},
title = {Coarse-grained models using local-density potentials optimized with the relative entropy: Application to implicit solvation},
journal = {J.~Chem.~Phys.},
year = 2016,
DOI = doi.org/10.1063/1.4958629}
@Article{Sanyal18,
author = {T.Sanyal and M.Scott Shell},
title = {Transferable coarse-grained models of liquid-liquid equilibrium using local density potentials optimized with the relative entropy},
journal = {J.~Phys.~Chem. B},
year = 2018,
DOI = doi.org/10.1021/acs.jpcb.7b12446}
CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
WARNING: Using 'neigh_modify every 1 delay 0 check yes' setting during minimization (src/min.cpp:187)
generated 0 of 0 mixed pair_coeff terms from geometric mixing rule
Neighbor list info ...
update every 1 steps, delay 0 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 17
ghost atom cutoff = 17
binsize = 8.5, bins = 8 8 8
2 neighbor lists, perpetual/occasional/extra = 2 0 0
(1) pair table, perpetual
attributes: half, newton on
pair build: half/bin/newton
stencil: half/bin/3d
bin: standard
(2) pair local/density, perpetual, copy from (1)
attributes: half, newton on
pair build: copy
stencil: none
bin: none
Per MPI rank memory allocation (min/avg/max) = 7.855 | 7.855 | 7.855 Mbytes
TotEng KinEng PotEng Temp E_vdwl
1283.8556 2234.7133 -950.85771 300 -950.85771
-10.187232 2234.7133 -2244.9005 300 -2244.9005
-124.3661 2234.7133 -2359.0794 300 -2359.0794
-146.7158 2234.7133 -2381.4291 300 -2381.4291
Loop time of 0.528503 on 4 procs for 244 steps with 2500 atoms
99.7% CPU use with 4 MPI tasks x 1 OpenMP threads
Minimization stats:
Stopping criterion = energy tolerance
Energy initial, next-to-last, final =
-950.857712502527 -2381.2294195605 -2381.42909821383
Force two-norm initial, final = 135.25170 2.3117934
Force max component initial, final = 14.083102 0.60833889
Final line search alpha, max atom move = 0.18347073 0.11161238
Iterations, force evaluations = 244 278
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.48518 | 0.48843 | 0.49223 | 0.4 | 92.42
Bond | 1.0084e-05 | 1.0861e-05 | 1.1483e-05 | 0.0 | 0.00
Neigh | 0.018199 | 0.019153 | 0.020036 | 0.5 | 3.62
Comm | 0.010229 | 0.014832 | 0.018994 | 2.6 | 2.81
Output | 3.7985e-05 | 4.2069e-05 | 5.3874e-05 | 0.0 | 0.01
Modify | 0 | 0 | 0 | 0.0 | 0.00
Other | | 0.006032 | | | 1.14
Nlocal: 625.000 ave 638 max 618 min
Histogram: 2 0 0 0 1 0 0 0 0 1
Nghost: 3613.75 ave 3640 max 3580 min
Histogram: 1 0 0 0 1 0 0 0 1 1
Neighs: 66411.2 ave 70713 max 62416 min
Histogram: 1 0 1 0 0 0 1 0 0 1
Total # of neighbors = 265645
Ave neighs/atom = 106.25800
Ave special neighs/atom = 0.0000000
Neighbor list builds = 13
Dangerous builds = 0
#set up integration parameters
fix timeintegration all nve
fix thermostat all langevin 3.0000e+02 3.0000e+02 1.0000e+02 59915
#Equilibration (for realistic results, run for 2000000 steps)
reset_timestep 0
thermo 200
thermo_style custom etotal ke pe temp evdwl
#run equilibration
run 2000
generated 0 of 0 mixed pair_coeff terms from geometric mixing rule
WARNING: Fix recenter should come after all other integration fixes (src/fix_recenter.cpp:133)
Per MPI rank memory allocation (min/avg/max) = 6.730 | 6.730 | 6.731 Mbytes
TotEng KinEng PotEng Temp E_vdwl
-146.7158 2234.7133 -2381.4291 300 -2381.4291
540.68168 2041.44 -1500.7584 274.05395 -1500.7584
945.4949 2163.7509 -1218.256 290.47363 -1218.256
1118.7729 2195.7579 -1076.985 294.77042 -1076.985
1215.0058 2233.2445 -1018.2387 299.80282 -1018.2387
1251.8045 2240.8439 -989.03944 300.823 -989.03944
1206.649 2149.5807 -942.93169 288.57134 -942.93169
1290.6111 2248.3623 -957.75117 301.83231 -957.75117
1312.8944 2219.147 -906.25264 297.9103 -906.25264
1260.002 2211.4176 -951.41561 296.87266 -951.41561
1335.0956 2270.1367 -935.04108 304.75543 -935.04108
Loop time of 3.56721 on 4 procs for 2000 steps with 2500 atoms
Performance: 48.441 ns/day, 0.495 hours/ns, 560.663 timesteps/s
99.8% CPU use with 4 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 3.3122 | 3.3399 | 3.3633 | 1.0 | 93.63
Bond | 7.5941e-05 | 8.062e-05 | 8.7627e-05 | 0.0 | 0.00
Neigh | 0.03524 | 0.036666 | 0.037864 | 0.6 | 1.03
Comm | 0.080116 | 0.10444 | 0.13373 | 6.1 | 2.93
Output | 0.00019977 | 0.00022502 | 0.00029007 | 0.0 | 0.01
Modify | 0.077781 | 0.078206 | 0.078752 | 0.1 | 2.19
Other | | 0.007641 | | | 0.21
Nlocal: 625.000 ave 637 max 616 min
Histogram: 1 0 1 0 1 0 0 0 0 1
Nghost: 3597.25 ave 3610 max 3586 min
Histogram: 1 0 1 0 0 0 1 0 0 1
Neighs: 66468.2 ave 69230 max 62721 min
Histogram: 1 0 0 1 0 0 0 0 0 2
Total # of neighbors = 265873
Ave neighs/atom = 106.34920
Ave special neighs/atom = 0.0000000
Neighbor list builds = 20
Dangerous builds = 0
#turn off recentering during production run
unfix recentering
#setup trajectory output
dump myDump all custom 100 methanol_implicit_water.lammpstrj.gz id type x y z element
dump_modify myDump element M
dump_modify myDump sort id
#run production (for realistic results, run for 10000000 steps)
reset_timestep 0
thermo 1000
thermo_style custom etotal ke pe temp evdwl
run 10000
generated 0 of 0 mixed pair_coeff terms from geometric mixing rule
Per MPI rank memory allocation (min/avg/max) = 8.071 | 8.071 | 8.071 Mbytes
TotEng KinEng PotEng Temp E_vdwl
1335.0956 2270.1367 -935.04108 304.75543 -935.04108
1266.2305 2227.2123 -960.98186 298.99303 -960.98186
1304.2289 2238.1343 -933.90544 300.45925 -933.90544
1311.3201 2232.0862 -920.7661 299.64733 -920.7661
1289.9028 2241.3533 -951.45049 300.89139 -951.45049
1314.2234 2244.8514 -930.62797 301.361 -930.62797
1282.2744 2240.6716 -958.39719 300.79987 -958.39719
1239.302 2181.5711 -942.2691 292.86591 -942.2691
1327.0954 2242.6441 -915.54875 301.06468 -915.54875
1334.9799 2239.6841 -904.70423 300.66731 -904.70423
1320.6105 2263.4912 -942.88066 303.8633 -942.88066
Loop time of 23.3399 on 4 procs for 10000 steps with 2500 atoms
Performance: 37.018 ns/day, 0.648 hours/ns, 428.451 timesteps/s
99.5% CPU use with 4 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 21.343 | 21.606 | 21.766 | 3.7 | 92.57
Bond | 0.00045963 | 0.0004817 | 0.0005083 | 0.0 | 0.00
Neigh | 0.20708 | 0.22081 | 0.22733 | 1.7 | 0.95
Comm | 0.63014 | 0.80326 | 1.0801 | 19.8 | 3.44
Output | 0.11791 | 0.14443 | 0.22211 | 11.8 | 0.62
Modify | 0.37291 | 0.389 | 0.41719 | 2.7 | 1.67
Other | | 0.1761 | | | 0.75
Nlocal: 625.000 ave 636 max 613 min
Histogram: 1 0 0 0 0 2 0 0 0 1
Nghost: 3597.00 ave 3613 max 3580 min
Histogram: 1 0 0 1 0 0 0 1 0 1
Neighs: 66408.5 ave 69186 max 61728 min
Histogram: 1 0 0 0 0 0 1 0 1 1
Total # of neighbors = 265634
Ave neighs/atom = 106.25360
Ave special neighs/atom = 0.0000000
Neighbor list builds = 102
Dangerous builds = 0
Total wall time: 0:00:27

View File

@ -0,0 +1 @@
../../potentials/SiC.tersoff.zbl

View File

@ -7,7 +7,7 @@ units metal
atom_style atomic
atom_modify map array
boundary p p p
atom_modify sort 0 0.0
atom_modify sort 0 0.0
# temperature
@ -35,23 +35,23 @@ region myreg block 0 4 &
create_box 8 myreg
create_atoms 1 region myreg &
basis 1 1 &
basis 2 2 &
basis 3 3 &
basis 4 4 &
basis 5 5 &
basis 6 6 &
basis 7 7 &
basis 8 8
basis 1 1 &
basis 2 2 &
basis 3 3 &
basis 4 4 &
basis 5 5 &
basis 6 6 &
basis 7 7 &
basis 8 8
mass * 28.06
velocity all create $t 5287287 loop geom
velocity all create $t 5287287 loop geom
# Equilibrate using Stillinger-Weber model for silicon
pair_style sw
pair_coeff * * Si.sw Si Si Si Si Si Si Si Si
pair_coeff * * Si.sw Si Si Si Si Si Si Si Si
thermo_style custom step temp epair etotal econserve press
thermo 10
@ -61,15 +61,15 @@ neighbor 1.0 bin
neigh_modify every 1 delay 10 check yes
run 100
write_restart restart.equil
write_restart restart.equil
# Test Stillinger-Weber model for Cd/Te/Zn/Se/Hg/S
clear
read_restart restart.equil
read_restart restart.equil
pair_style sw
pair_coeff * * CdTeZnSeHgS0.sw Cd Zn Hg Cd Te S Se Te
pair_coeff * * CdTeZnSeHgS0.sw Cd Zn Hg Cd Te S Se Te
thermo_style custom step temp epair etotal econserve press
thermo 10
@ -82,10 +82,10 @@ run 100
# Test Vashishta model for In/P
clear
read_restart restart.equil
read_restart restart.equil
pair_style vashishta
pair_coeff * * InP.vashishta In In In In P P P P
pair_coeff * * InP.vashishta In In In In P P P P
thermo_style custom step temp epair etotal econserve press
thermo 10
@ -98,13 +98,13 @@ run 100
# Test Tersoff model for B/N/C
clear
read_restart restart.equil
read_restart restart.equil
variable fac equal 0.6
change_box all x scale ${fac} y scale ${fac} z scale ${fac} remap
variable fac equal 0.6
change_box all x scale ${fac} y scale ${fac} z scale ${fac} remap
pair_style tersoff
pair_coeff * * BNC.tersoff N N N C B B C B
pair_coeff * * BNC.tersoff N N N C B B C B
thermo_style custom step temp epair etotal econserve press
thermo 10
@ -114,3 +114,23 @@ neighbor 1.0 bin
neigh_modify every 1 delay 10 check yes
run 100
# Test Tersoff/ZBL model for SiC
clear
read_restart restart.equil
variable fac equal 0.6
change_box all x scale ${fac} y scale ${fac} z scale ${fac} remap
pair_style tersoff/zbl
pair_coeff * * SiC.tersoff.zbl C C C C Si Si Si Si
thermo_style custom step temp epair etotal econserve press
thermo 10
fix 1 all nvt temp $t $t 0.1
timestep 1.0e-3
neighbor 1.0 bin
neigh_modify every 1 delay 10 check yes
run 100
shell rm restart.equil

View File

@ -1,4 +1,4 @@
LAMMPS (24 Dec 2020)
LAMMPS (29 Sep 2021 - Update 3)
using 1 OpenMP thread(s) per MPI task
# Simple regression tests for threebody potentials
@ -9,7 +9,7 @@ units metal
atom_style atomic
atom_modify map array
boundary p p p
atom_modify sort 0 0.0
atom_modify sort 0 0.0
# temperature
@ -27,19 +27,20 @@ region myreg block 0 4 0 4
create_box 8 myreg
Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (21.724000 21.724000 21.724000)
1 by 1 by 1 MPI processor grid
create_atoms 1 region myreg basis 1 1 basis 2 2 basis 3 3 basis 4 4 basis 5 5 basis 6 6 basis 7 7 basis 8 8
create_atoms 1 region myreg basis 1 1 basis 2 2 basis 3 3 basis 4 4 basis 5 5 basis 6 6 basis 7 7 basis 8 8
Created 512 atoms
create_atoms CPU = 0.001 seconds
using lattice units in orthogonal box = (0.0000000 0.0000000 0.0000000) to (21.724000 21.724000 21.724000)
create_atoms CPU = 0.000 seconds
mass * 28.06
velocity all create $t 5287287 loop geom
velocity all create 1800 5287287 loop geom
velocity all create $t 5287287 loop geom
velocity all create 1800 5287287 loop geom
# Equilibrate using Stillinger-Weber model for silicon
pair_style sw
pair_coeff * * Si.sw Si Si Si Si Si Si Si Si
pair_coeff * * Si.sw Si Si Si Si Si Si Si Si
Reading sw potential file Si.sw with DATE: 2007-06-11
thermo_style custom step temp epair etotal econserve press
@ -76,20 +77,20 @@ Step Temp E_pair TotEng Econserve Press
80 800.80221 -2146.1371 -2093.2426 -2101.313 11995.66
90 1293.9689 -2176.9021 -2091.4329 -2101.3848 11692.45
100 1112.9699 -2162.7259 -2089.2121 -2101.3478 12263.758
Loop time of 0.157871 on 1 procs for 100 steps with 512 atoms
Loop time of 0.093281 on 1 procs for 100 steps with 512 atoms
Performance: 54.728 ns/day, 0.439 hours/ns, 633.430 timesteps/s
Performance: 92.623 ns/day, 0.259 hours/ns, 1072.029 timesteps/s
99.8% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.14704 | 0.14704 | 0.14704 | 0.0 | 93.14
Neigh | 0.00247 | 0.00247 | 0.00247 | 0.0 | 1.56
Comm | 0.0024729 | 0.0024729 | 0.0024729 | 0.0 | 1.57
Output | 0.0002656 | 0.0002656 | 0.0002656 | 0.0 | 0.17
Modify | 0.0050237 | 0.0050237 | 0.0050237 | 0.0 | 3.18
Other | | 0.0006011 | | | 0.38
Pair | 0.090256 | 0.090256 | 0.090256 | 0.0 | 96.76
Neigh | 0.0015078 | 0.0015078 | 0.0015078 | 0.0 | 1.62
Comm | 0.00045896 | 0.00045896 | 0.00045896 | 0.0 | 0.49
Output | 8.3447e-05 | 8.3447e-05 | 8.3447e-05 | 0.0 | 0.09
Modify | 0.00072384 | 0.00072384 | 0.00072384 | 0.0 | 0.78
Other | | 0.0002506 | | | 0.27
Nlocal: 512.000 ave 512 max 512 min
Histogram: 1 0 0 0 0 0 0 0 0 0
@ -105,25 +106,25 @@ Ave neighs/atom = 27.320312
Neighbor list builds = 2
Dangerous builds = 0
write_restart restart.equil
write_restart restart.equil
System init for write_restart ...
# Test Stillinger-Weber model for Cd/Te/Zn/Se/Hg/S
clear
using 1 OpenMP thread(s) per MPI task
read_restart restart.equil
read_restart restart.equil
Reading restart file ...
restart file = 24 Dec 2020, LAMMPS = 24 Dec 2020
restart file = 29 Sep 2021, LAMMPS = 29 Sep 2021
restoring atom style atomic from restart
orthogonal box = (0.0000000 0.0000000 0.0000000) to (21.724000 21.724000 21.724000)
1 by 1 by 1 MPI processor grid
pair style sw stores no restart info
512 atoms
read_restart CPU = 0.001 seconds
read_restart CPU = 0.000 seconds
pair_style sw
pair_coeff * * CdTeZnSeHgS0.sw Cd Zn Hg Cd Te S Se Te
pair_coeff * * CdTeZnSeHgS0.sw Cd Zn Hg Cd Te S Se Te
Reading sw potential file CdTeZnSeHgS0.sw with DATE: 2013-08-09
thermo_style custom step temp epair etotal econserve press
@ -163,20 +164,20 @@ Step Temp E_pair TotEng Econserve Press
180 1856.1197 -657.14338 -534.54309 -564.48754 488372.27
190 1346.1107 -621.42431 -532.5111 -564.38065 511750.04
200 1919.5266 -657.26587 -530.47743 -564.47797 488684.56
Loop time of 0.455825 on 1 procs for 100 steps with 512 atoms
Loop time of 0.245572 on 1 procs for 100 steps with 512 atoms
Performance: 18.955 ns/day, 1.266 hours/ns, 219.382 timesteps/s
99.9% CPU use with 1 MPI tasks x 1 OpenMP threads
Performance: 35.183 ns/day, 0.682 hours/ns, 407.212 timesteps/s
99.8% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.44091 | 0.44091 | 0.44091 | 0.0 | 96.73
Neigh | 0.0054555 | 0.0054555 | 0.0054555 | 0.0 | 1.20
Comm | 0.0035784 | 0.0035784 | 0.0035784 | 0.0 | 0.79
Output | 0.00024486 | 0.00024486 | 0.00024486 | 0.0 | 0.05
Modify | 0.0050471 | 0.0050471 | 0.0050471 | 0.0 | 1.11
Other | | 0.000592 | | | 0.13
Pair | 0.24139 | 0.24139 | 0.24139 | 0.0 | 98.30
Neigh | 0.0027068 | 0.0027068 | 0.0027068 | 0.0 | 1.10
Comm | 0.00051188 | 0.00051188 | 0.00051188 | 0.0 | 0.21
Output | 0.00010395 | 0.00010395 | 0.00010395 | 0.0 | 0.04
Modify | 0.00059605 | 0.00059605 | 0.00059605 | 0.0 | 0.24
Other | | 0.0002608 | | | 0.11
Nlocal: 512.000 ave 512 max 512 min
Histogram: 1 0 0 0 0 0 0 0 0 0
@ -196,18 +197,18 @@ Dangerous builds = 0
clear
using 1 OpenMP thread(s) per MPI task
read_restart restart.equil
read_restart restart.equil
Reading restart file ...
restart file = 24 Dec 2020, LAMMPS = 24 Dec 2020
restart file = 29 Sep 2021, LAMMPS = 29 Sep 2021
restoring atom style atomic from restart
orthogonal box = (0.0000000 0.0000000 0.0000000) to (21.724000 21.724000 21.724000)
1 by 1 by 1 MPI processor grid
pair style sw stores no restart info
512 atoms
read_restart CPU = 0.001 seconds
read_restart CPU = 0.000 seconds
pair_style vashishta
pair_coeff * * InP.vashishta In In In In P P P P
pair_coeff * * InP.vashishta In In In In P P P P
Reading vashishta potential file InP.vashishta with DATE: 2015-10-14
thermo_style custom step temp epair etotal econserve press
@ -247,20 +248,20 @@ Step Temp E_pair TotEng Econserve Press
180 1302.9041 -1491.7765 -1405.7172 -1435.8971 249514.04
190 1332.3326 -1491.5271 -1403.524 -1435.9213 227537.99
200 1352.1813 -1490.4513 -1401.1371 -1435.9049 207626.42
Loop time of 0.217808 on 1 procs for 100 steps with 512 atoms
Loop time of 0.111899 on 1 procs for 100 steps with 512 atoms
Performance: 39.668 ns/day, 0.605 hours/ns, 459.121 timesteps/s
98.2% CPU use with 1 MPI tasks x 1 OpenMP threads
Performance: 77.212 ns/day, 0.311 hours/ns, 893.662 timesteps/s
99.9% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.19635 | 0.19635 | 0.19635 | 0.0 | 90.15
Neigh | 0.01054 | 0.01054 | 0.01054 | 0.0 | 4.84
Comm | 0.0051923 | 0.0051923 | 0.0051923 | 0.0 | 2.38
Output | 0.00027919 | 0.00027919 | 0.00027919 | 0.0 | 0.13
Modify | 0.0048637 | 0.0048637 | 0.0048637 | 0.0 | 2.23
Other | | 0.0005858 | | | 0.27
Pair | 0.10539 | 0.10539 | 0.10539 | 0.0 | 94.18
Neigh | 0.0049229 | 0.0049229 | 0.0049229 | 0.0 | 4.40
Comm | 0.00068307 | 0.00068307 | 0.00068307 | 0.0 | 0.61
Output | 6.1989e-05 | 6.1989e-05 | 6.1989e-05 | 0.0 | 0.06
Modify | 0.00058532 | 0.00058532 | 0.00058532 | 0.0 | 0.52
Other | | 0.0002604 | | | 0.23
Nlocal: 512.000 ave 512 max 512 min
Histogram: 1 0 0 0 0 0 0 0 0 0
@ -280,28 +281,28 @@ Dangerous builds = 0
clear
using 1 OpenMP thread(s) per MPI task
read_restart restart.equil
read_restart restart.equil
Reading restart file ...
restart file = 24 Dec 2020, LAMMPS = 24 Dec 2020
restart file = 29 Sep 2021, LAMMPS = 29 Sep 2021
restoring atom style atomic from restart
orthogonal box = (0.0000000 0.0000000 0.0000000) to (21.724000 21.724000 21.724000)
1 by 1 by 1 MPI processor grid
pair style sw stores no restart info
512 atoms
read_restart CPU = 0.001 seconds
read_restart CPU = 0.000 seconds
variable fac equal 0.6
change_box all x scale ${fac} y scale ${fac} z scale ${fac} remap
change_box all x scale 0.6 y scale ${fac} z scale ${fac} remap
change_box all x scale 0.6 y scale 0.6 z scale ${fac} remap
change_box all x scale 0.6 y scale 0.6 z scale 0.6 remap
variable fac equal 0.6
change_box all x scale ${fac} y scale ${fac} z scale ${fac} remap
change_box all x scale 0.6 y scale ${fac} z scale ${fac} remap
change_box all x scale 0.6 y scale 0.6 z scale ${fac} remap
change_box all x scale 0.6 y scale 0.6 z scale 0.6 remap
Changing box ...
orthogonal box = (4.3448000 0.0000000 0.0000000) to (17.379200 21.724000 21.724000)
orthogonal box = (4.3448000 4.3448000 0.0000000) to (17.379200 17.379200 21.724000)
orthogonal box = (4.3448000 4.3448000 4.3448000) to (17.379200 17.379200 17.379200)
pair_style tersoff
pair_coeff * * BNC.tersoff N N N C B B C B
pair_coeff * * BNC.tersoff N N N C B B C B
Reading tersoff potential file BNC.tersoff with DATE: 2013-03-21
thermo_style custom step temp epair etotal econserve press
@ -341,20 +342,20 @@ Step Temp E_pair TotEng Econserve Press
180 1337.4358 -3254.9844 -3166.6442 -3196.8222 1880420.9
190 1441.8052 -3259.0364 -3163.8023 -3196.3556 1904512.1
200 1569.0317 -3265.0089 -3161.3714 -3196.3328 1899462.7
Loop time of 0.487425 on 1 procs for 100 steps with 512 atoms
Loop time of 0.097734 on 1 procs for 100 steps with 512 atoms
Performance: 17.726 ns/day, 1.354 hours/ns, 205.160 timesteps/s
99.1% CPU use with 1 MPI tasks x 1 OpenMP threads
Performance: 88.403 ns/day, 0.271 hours/ns, 1023.186 timesteps/s
99.7% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.47762 | 0.47762 | 0.47762 | 0.0 | 97.99
Neigh | 0.0014286 | 0.0014286 | 0.0014286 | 0.0 | 0.29
Comm | 0.0024068 | 0.0024068 | 0.0024068 | 0.0 | 0.49
Output | 0.00028992 | 0.00028992 | 0.00028992 | 0.0 | 0.06
Modify | 0.0050635 | 0.0050635 | 0.0050635 | 0.0 | 1.04
Other | | 0.0006182 | | | 0.13
Pair | 0.095481 | 0.095481 | 0.095481 | 0.0 | 97.69
Neigh | 0.000772 | 0.000772 | 0.000772 | 0.0 | 0.79
Comm | 0.00046158 | 0.00046158 | 0.00046158 | 0.0 | 0.47
Output | 6.7949e-05 | 6.7949e-05 | 6.7949e-05 | 0.0 | 0.07
Modify | 0.00068784 | 0.00068784 | 0.00068784 | 0.0 | 0.70
Other | | 0.0002635 | | | 0.27
Nlocal: 512.000 ave 512 max 512 min
Histogram: 1 0 0 0 0 0 0 0 0 0
@ -370,4 +371,99 @@ Ave neighs/atom = 28.523438
Neighbor list builds = 1
Dangerous builds = 0
# Test Tersoff/ZBL model for SiC
clear
using 1 OpenMP thread(s) per MPI task
read_restart restart.equil
Reading restart file ...
restart file = 29 Sep 2021, LAMMPS = 29 Sep 2021
restoring atom style atomic from restart
orthogonal box = (0.0000000 0.0000000 0.0000000) to (21.724000 21.724000 21.724000)
1 by 1 by 1 MPI processor grid
pair style sw stores no restart info
512 atoms
read_restart CPU = 0.000 seconds
variable fac equal 0.6
change_box all x scale ${fac} y scale ${fac} z scale ${fac} remap
change_box all x scale 0.6 y scale ${fac} z scale ${fac} remap
change_box all x scale 0.6 y scale 0.6 z scale ${fac} remap
change_box all x scale 0.6 y scale 0.6 z scale 0.6 remap
Changing box ...
orthogonal box = (4.3448000 0.0000000 0.0000000) to (17.379200 21.724000 21.724000)
orthogonal box = (4.3448000 4.3448000 0.0000000) to (17.379200 17.379200 21.724000)
orthogonal box = (4.3448000 4.3448000 4.3448000) to (17.379200 17.379200 17.379200)
pair_style tersoff/zbl
pair_coeff * * SiC.tersoff.zbl C C C C Si Si Si Si
Reading tersoff/zbl potential file SiC.tersoff.zbl with DATE: 2009-04-15
thermo_style custom step temp epair etotal econserve press
thermo 10
fix 1 all nvt temp $t $t 0.1
fix 1 all nvt temp 1800 $t 0.1
fix 1 all nvt temp 1800 1800 0.1
Resetting global fix info from restart file:
fix style: nvt, fix ID: 1
timestep 1.0e-3
neighbor 1.0 bin
neigh_modify every 1 delay 10 check yes
run 100
All restart file global fix info was re-assigned
Neighbor list info ...
update every 1 steps, delay 10 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 4
ghost atom cutoff = 4
binsize = 2, bins = 7 7 7
1 neighbor lists, perpetual/occasional/extra = 1 0 0
(1) pair tersoff/zbl, perpetual
attributes: full, newton on
pair build: full/bin/atomonly
stencil: full/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 3.002 | 3.002 | 3.002 Mbytes
Step Temp E_pair TotEng Econserve Press
100 1112.9699 7067.9634 7141.4772 7129.3415 17683957
110 1676.669 7033.1458 7143.893 7128.6921 17837566
120 2450.2667 6982.2491 7144.094 7126.9524 18220027
130 2726.9659 6964.1219 7144.2432 7126.7678 18230324
140 2729.421 6962.7393 7143.0228 7127.2074 18176317
150 2738.5449 6959.1761 7140.0623 7127.6671 18068370
160 2687.2419 6958.1183 7135.6158 7127.8492 18156214
170 2697.7325 6952.1482 7130.3387 7127.7898 17978251
180 2577.9885 6954.5611 7124.8422 7127.5615 18068920
190 2502.6928 6954.4558 7119.7635 7127.67 18049652
200 2517.4866 6947.962 7114.2469 7127.1972 18209451
Loop time of 0.783169 on 1 procs for 100 steps with 512 atoms
Performance: 11.032 ns/day, 2.175 hours/ns, 127.686 timesteps/s
99.9% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.78056 | 0.78056 | 0.78056 | 0.0 | 99.67
Neigh | 0.0011299 | 0.0011299 | 0.0011299 | 0.0 | 0.14
Comm | 0.00051332 | 0.00051332 | 0.00051332 | 0.0 | 0.07
Output | 9.2268e-05 | 9.2268e-05 | 9.2268e-05 | 0.0 | 0.01
Modify | 0.00060058 | 0.00060058 | 0.00060058 | 0.0 | 0.08
Other | | 0.0002706 | | | 0.03
Nlocal: 512.000 ave 512 max 512 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1542.00 ave 1542 max 1542 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 0.00000 ave 0 max 0 min
Histogram: 1 0 0 0 0 0 0 0 0 0
FullNghs: 30142.0 ave 30142 max 30142 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 30142
Ave neighs/atom = 58.871094
Neighbor list builds = 1
Dangerous builds = 0
shell rm restart.equil
Total wall time: 0:00:01

View File

@ -1,4 +1,4 @@
LAMMPS (24 Dec 2020)
LAMMPS (29 Sep 2021 - Update 3)
using 1 OpenMP thread(s) per MPI task
# Simple regression tests for threebody potentials
@ -9,7 +9,7 @@ units metal
atom_style atomic
atom_modify map array
boundary p p p
atom_modify sort 0 0.0
atom_modify sort 0 0.0
# temperature
@ -26,20 +26,21 @@ region myreg block 0 4 0 4
create_box 8 myreg
Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (21.724000 21.724000 21.724000)
1 by 2 by 2 MPI processor grid
create_atoms 1 region myreg basis 1 1 basis 2 2 basis 3 3 basis 4 4 basis 5 5 basis 6 6 basis 7 7 basis 8 8
1 by 1 by 1 MPI processor grid
create_atoms 1 region myreg basis 1 1 basis 2 2 basis 3 3 basis 4 4 basis 5 5 basis 6 6 basis 7 7 basis 8 8
Created 512 atoms
create_atoms CPU = 0.074 seconds
using lattice units in orthogonal box = (0.0000000 0.0000000 0.0000000) to (21.724000 21.724000 21.724000)
create_atoms CPU = 0.000 seconds
mass * 28.06
velocity all create $t 5287287 loop geom
velocity all create 1800 5287287 loop geom
velocity all create $t 5287287 loop geom
velocity all create 1800 5287287 loop geom
# Equilibrate using Stillinger-Weber model for silicon
pair_style sw
pair_coeff * * Si.sw Si Si Si Si Si Si Si Si
pair_coeff * * Si.sw Si Si Si Si Si Si Si Si
Reading sw potential file Si.sw with DATE: 2007-06-11
thermo_style custom step temp epair etotal econserve press
@ -63,7 +64,7 @@ Neighbor list info ...
pair build: full/bin/atomonly
stencil: full/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 2.958 | 2.958 | 2.958 Mbytes
Per MPI rank memory allocation (min/avg/max) = 2.983 | 2.983 | 2.983 Mbytes
Step Temp E_pair TotEng Econserve Press
0 1800 -2220.3392 -2101.4457 -2101.4457 12358.626
10 1006.0192 -2167.7053 -2101.2558 -2101.3286 13892.426
@ -76,54 +77,54 @@ Step Temp E_pair TotEng Econserve Press
80 800.80221 -2146.1371 -2093.2426 -2101.313 11995.66
90 1293.9689 -2176.9021 -2091.4329 -2101.3848 11692.45
100 1112.9699 -2162.7259 -2089.2121 -2101.3478 12263.758
Loop time of 0.0998364 on 4 procs for 100 steps with 512 atoms
Loop time of 0.089642 on 1 procs for 100 steps with 512 atoms
Performance: 86.542 ns/day, 0.277 hours/ns, 1001.639 timesteps/s
81.4% CPU use with 4 MPI tasks x 1 OpenMP threads
Performance: 96.383 ns/day, 0.249 hours/ns, 1115.548 timesteps/s
99.7% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.037337 | 0.049389 | 0.069239 | 5.9 | 49.47
Neigh | 0.00067854 | 0.00068814 | 0.00070286 | 0.0 | 0.69
Comm | 0.025239 | 0.04504 | 0.056869 | 6.1 | 45.11
Output | 0.00015712 | 0.00082219 | 0.0028148 | 0.0 | 0.82
Modify | 0.0014369 | 0.0015754 | 0.0016632 | 0.2 | 1.58
Other | | 0.002321 | | | 2.33
Pair | 0.086619 | 0.086619 | 0.086619 | 0.0 | 96.63
Neigh | 0.0015211 | 0.0015211 | 0.0015211 | 0.0 | 1.70
Comm | 0.000458 | 0.000458 | 0.000458 | 0.0 | 0.51
Output | 7.987e-05 | 7.987e-05 | 7.987e-05 | 0.0 | 0.09
Modify | 0.00073361 | 0.00073361 | 0.00073361 | 0.0 | 0.82
Other | | 0.0002301 | | | 0.26
Nlocal: 128.000 ave 132 max 125 min
Histogram: 1 1 0 0 0 1 0 0 0 1
Nghost: 525.000 ave 528 max 521 min
Histogram: 1 0 0 0 1 0 0 0 1 1
Nlocal: 512.000 ave 512 max 512 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1017.00 ave 1017 max 1017 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 0.00000 ave 0 max 0 min
Histogram: 4 0 0 0 0 0 0 0 0 0
FullNghs: 3497.00 ave 3619 max 3397 min
Histogram: 1 1 0 0 0 0 1 0 0 1
Histogram: 1 0 0 0 0 0 0 0 0 0
FullNghs: 13988.0 ave 13988 max 13988 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 13988
Ave neighs/atom = 27.320312
Neighbor list builds = 2
Dangerous builds = 0
write_restart restart.equil
write_restart restart.equil
System init for write_restart ...
# Test Stillinger-Weber model for Cd/Te/Zn/Se/Hg/S
clear
using 1 OpenMP thread(s) per MPI task
read_restart restart.equil
read_restart restart.equil
Reading restart file ...
restart file = 24 Dec 2020, LAMMPS = 24 Dec 2020
restart file = 29 Sep 2021, LAMMPS = 29 Sep 2021
restoring atom style atomic from restart
orthogonal box = (0.0000000 0.0000000 0.0000000) to (21.724000 21.724000 21.724000)
1 by 2 by 2 MPI processor grid
1 by 1 by 1 MPI processor grid
pair style sw stores no restart info
512 atoms
read_restart CPU = 0.001 seconds
read_restart CPU = 0.000 seconds
pair_style sw
pair_coeff * * CdTeZnSeHgS0.sw Cd Zn Hg Cd Te S Se Te
pair_coeff * * CdTeZnSeHgS0.sw Cd Zn Hg Cd Te S Se Te
Reading sw potential file CdTeZnSeHgS0.sw with DATE: 2013-08-09
thermo_style custom step temp epair etotal econserve press
@ -150,42 +151,42 @@ Neighbor list info ...
pair build: full/bin/atomonly
stencil: full/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 2.967 | 2.967 | 2.968 Mbytes
Per MPI rank memory allocation (min/avg/max) = 3.001 | 3.001 | 3.001 Mbytes
Step Temp E_pair TotEng Econserve Press
100 1112.9699 -625.76163 -552.24782 -564.38354 462129.66
100 1112.9699 -625.76163 -552.24781 -564.38354 462129.66
110 1502.8461 -649.55768 -550.29179 -564.45814 463413.45
120 1926.4523 -674.71265 -547.46675 -564.53613 486338.88
130 1152.6663 -621.47265 -545.33681 -564.37203 514892.19
120 1926.4523 -674.71265 -547.46675 -564.53612 486338.88
130 1152.6663 -621.47264 -545.33681 -564.37203 514892.2
140 1762.244 -659.86941 -543.46979 -564.4985 488159.88
150 1767.8665 -657.67179 -540.90079 -564.48386 466721.31
160 1075.2874 -610.1281 -539.10328 -564.36709 470151.9
170 1697.9313 -649.3684 -537.21676 -564.47208 467953.7
180 1856.1197 -657.14338 -534.54309 -564.48754 488372.26
190 1346.1107 -621.42432 -532.5111 -564.38065 511750.03
150 1767.8665 -657.67178 -540.90078 -564.48386 466721.31
160 1075.2874 -610.12809 -539.10328 -564.36709 470151.9
170 1697.9313 -649.3684 -537.21675 -564.47207 467953.71
180 1856.1197 -657.14338 -534.54309 -564.48754 488372.27
190 1346.1107 -621.42431 -532.5111 -564.38065 511750.04
200 1919.5266 -657.26587 -530.47743 -564.47797 488684.56
Loop time of 0.286556 on 4 procs for 100 steps with 512 atoms
Loop time of 0.268183 on 1 procs for 100 steps with 512 atoms
Performance: 30.151 ns/day, 0.796 hours/ns, 348.971 timesteps/s
81.7% CPU use with 4 MPI tasks x 1 OpenMP threads
Performance: 32.217 ns/day, 0.745 hours/ns, 372.880 timesteps/s
99.8% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.11093 | 0.139 | 0.16864 | 5.8 | 48.51
Neigh | 0.0014305 | 0.0014756 | 0.0015156 | 0.1 | 0.51
Comm | 0.10154 | 0.12374 | 0.16907 | 7.8 | 43.18
Output | 0.0001862 | 0.00030428 | 0.0006578 | 0.0 | 0.11
Modify | 0.0038164 | 0.019159 | 0.034146 | 10.8 | 6.69
Other | | 0.002872 | | | 1.00
Pair | 0.26374 | 0.26374 | 0.26374 | 0.0 | 98.34
Neigh | 0.0027301 | 0.0027301 | 0.0027301 | 0.0 | 1.02
Comm | 0.00063014 | 0.00063014 | 0.00063014 | 0.0 | 0.23
Output | 8.4639e-05 | 8.4639e-05 | 8.4639e-05 | 0.0 | 0.03
Modify | 0.00072742 | 0.00072742 | 0.00072742 | 0.0 | 0.27
Other | | 0.0002725 | | | 0.10
Nlocal: 128.000 ave 135 max 122 min
Histogram: 1 0 1 0 0 0 1 0 0 1
Nghost: 759.750 ave 770 max 751 min
Histogram: 1 0 0 1 1 0 0 0 0 1
Nlocal: 512.000 ave 512 max 512 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1428.00 ave 1428 max 1428 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 0.00000 ave 0 max 0 min
Histogram: 4 0 0 0 0 0 0 0 0 0
FullNghs: 4336.00 ave 4563 max 4128 min
Histogram: 1 0 1 0 0 0 1 0 0 1
Histogram: 1 0 0 0 0 0 0 0 0 0
FullNghs: 17344.0 ave 17344 max 17344 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 17344
Ave neighs/atom = 33.875000
@ -196,18 +197,18 @@ Dangerous builds = 0
clear
using 1 OpenMP thread(s) per MPI task
read_restart restart.equil
read_restart restart.equil
Reading restart file ...
restart file = 24 Dec 2020, LAMMPS = 24 Dec 2020
restart file = 29 Sep 2021, LAMMPS = 29 Sep 2021
restoring atom style atomic from restart
orthogonal box = (0.0000000 0.0000000 0.0000000) to (21.724000 21.724000 21.724000)
1 by 2 by 2 MPI processor grid
1 by 1 by 1 MPI processor grid
pair style sw stores no restart info
512 atoms
read_restart CPU = 0.001 seconds
read_restart CPU = 0.000 seconds
pair_style vashishta
pair_coeff * * InP.vashishta In In In In P P P P
pair_coeff * * InP.vashishta In In In In P P P P
Reading vashishta potential file InP.vashishta with DATE: 2015-10-14
thermo_style custom step temp epair etotal econserve press
@ -234,7 +235,7 @@ Neighbor list info ...
pair build: full/bin/atomonly
stencil: full/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 2.988 | 2.988 | 2.988 Mbytes
Per MPI rank memory allocation (min/avg/max) = 3.025 | 3.025 | 3.025 Mbytes
Step Temp E_pair TotEng Econserve Press
100 1112.9699 -1497.2988 -1423.785 -1435.9207 355619.19
110 1250.545 -1504.5795 -1421.9785 -1435.9786 345188.52
@ -247,29 +248,29 @@ Step Temp E_pair TotEng Econserve Press
180 1302.9041 -1491.7765 -1405.7172 -1435.8971 249514.04
190 1332.3326 -1491.5271 -1403.524 -1435.9213 227537.99
200 1352.1813 -1490.4513 -1401.1371 -1435.9049 207626.42
Loop time of 0.14468 on 4 procs for 100 steps with 512 atoms
Loop time of 0.117875 on 1 procs for 100 steps with 512 atoms
Performance: 59.718 ns/day, 0.402 hours/ns, 691.179 timesteps/s
81.2% CPU use with 4 MPI tasks x 1 OpenMP threads
Performance: 73.298 ns/day, 0.327 hours/ns, 848.357 timesteps/s
99.6% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.047903 | 0.058669 | 0.086091 | 6.6 | 40.55
Neigh | 0.0027876 | 0.002852 | 0.0028808 | 0.1 | 1.97
Comm | 0.034642 | 0.066142 | 0.078599 | 7.1 | 45.72
Output | 0.00018477 | 0.0049147 | 0.019101 | 11.7 | 3.40
Modify | 0.0015709 | 0.0022651 | 0.0029545 | 1.4 | 1.57
Other | | 0.009837 | | | 6.80
Pair | 0.11085 | 0.11085 | 0.11085 | 0.0 | 94.04
Neigh | 0.005235 | 0.005235 | 0.005235 | 0.0 | 4.44
Comm | 0.00077152 | 0.00077152 | 0.00077152 | 0.0 | 0.65
Output | 7.9155e-05 | 7.9155e-05 | 7.9155e-05 | 0.0 | 0.07
Modify | 0.00065637 | 0.00065637 | 0.00065637 | 0.0 | 0.56
Other | | 0.0002811 | | | 0.24
Nlocal: 128.000 ave 131 max 124 min
Histogram: 1 0 0 0 0 1 0 1 0 1
Nghost: 1013.25 ave 1025 max 1002 min
Histogram: 1 1 0 0 0 0 0 0 1 1
Nlocal: 512.000 ave 512 max 512 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1838.00 ave 1838 max 1838 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 0.00000 ave 0 max 0 min
Histogram: 4 0 0 0 0 0 0 0 0 0
FullNghs: 9120.50 ave 9356 max 8868 min
Histogram: 1 0 0 0 1 0 1 0 0 1
Histogram: 1 0 0 0 0 0 0 0 0 0
FullNghs: 36482.0 ave 36482 max 36482 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 36482
Ave neighs/atom = 71.253906
@ -280,28 +281,28 @@ Dangerous builds = 0
clear
using 1 OpenMP thread(s) per MPI task
read_restart restart.equil
read_restart restart.equil
Reading restart file ...
restart file = 24 Dec 2020, LAMMPS = 24 Dec 2020
restart file = 29 Sep 2021, LAMMPS = 29 Sep 2021
restoring atom style atomic from restart
orthogonal box = (0.0000000 0.0000000 0.0000000) to (21.724000 21.724000 21.724000)
1 by 2 by 2 MPI processor grid
1 by 1 by 1 MPI processor grid
pair style sw stores no restart info
512 atoms
read_restart CPU = 0.005 seconds
read_restart CPU = 0.000 seconds
variable fac equal 0.6
change_box all x scale ${fac} y scale ${fac} z scale ${fac} remap
change_box all x scale 0.6 y scale ${fac} z scale ${fac} remap
change_box all x scale 0.6 y scale 0.6 z scale ${fac} remap
change_box all x scale 0.6 y scale 0.6 z scale 0.6 remap
variable fac equal 0.6
change_box all x scale ${fac} y scale ${fac} z scale ${fac} remap
change_box all x scale 0.6 y scale ${fac} z scale ${fac} remap
change_box all x scale 0.6 y scale 0.6 z scale ${fac} remap
change_box all x scale 0.6 y scale 0.6 z scale 0.6 remap
Changing box ...
orthogonal box = (4.3448000 0.0000000 0.0000000) to (17.379200 21.724000 21.724000)
orthogonal box = (4.3448000 4.3448000 0.0000000) to (17.379200 17.379200 21.724000)
orthogonal box = (4.3448000 4.3448000 4.3448000) to (17.379200 17.379200 17.379200)
pair_style tersoff
pair_coeff * * BNC.tersoff N N N C B B C B
pair_coeff * * BNC.tersoff N N N C B B C B
Reading tersoff potential file BNC.tersoff with DATE: 2013-03-21
thermo_style custom step temp epair etotal econserve press
@ -328,7 +329,7 @@ Neighbor list info ...
pair build: full/bin/atomonly
stencil: full/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 2.948 | 2.948 | 2.948 Mbytes
Per MPI rank memory allocation (min/avg/max) = 2.982 | 2.982 | 2.982 Mbytes
Step Temp E_pair TotEng Econserve Press
100 1112.9699 -3259.7676 -3186.2538 -3198.3895 1912461.3
110 1772.8268 -3301.5479 -3184.4493 -3198.8218 1885295.6
@ -341,33 +342,128 @@ Step Temp E_pair TotEng Econserve Press
180 1337.4358 -3254.9844 -3166.6442 -3196.8222 1880420.9
190 1441.8052 -3259.0364 -3163.8023 -3196.3556 1904512.1
200 1569.0317 -3265.0089 -3161.3714 -3196.3328 1899462.7
Loop time of 0.348631 on 4 procs for 100 steps with 512 atoms
Loop time of 0.098053 on 1 procs for 100 steps with 512 atoms
Performance: 24.783 ns/day, 0.968 hours/ns, 286.836 timesteps/s
81.0% CPU use with 4 MPI tasks x 1 OpenMP threads
Performance: 88.116 ns/day, 0.272 hours/ns, 1019.857 timesteps/s
99.8% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.13281 | 0.15657 | 0.20106 | 6.9 | 44.91
Neigh | 0.00037527 | 0.00039309 | 0.00040412 | 0.0 | 0.11
Comm | 0.12177 | 0.16672 | 0.19154 | 6.8 | 47.82
Output | 0.00019097 | 0.000462 | 0.0012722 | 0.0 | 0.13
Modify | 0.018353 | 0.020198 | 0.02302 | 1.3 | 5.79
Other | | 0.004286 | | | 1.23
Pair | 0.096055 | 0.096055 | 0.096055 | 0.0 | 97.96
Neigh | 0.00079703 | 0.00079703 | 0.00079703 | 0.0 | 0.81
Comm | 0.00034523 | 0.00034523 | 0.00034523 | 0.0 | 0.35
Output | 6.8903e-05 | 6.8903e-05 | 6.8903e-05 | 0.0 | 0.07
Modify | 0.00060797 | 0.00060797 | 0.00060797 | 0.0 | 0.62
Other | | 0.0001793 | | | 0.18
Nlocal: 128.000 ave 132 max 123 min
Histogram: 1 0 0 0 0 1 1 0 0 1
Nghost: 529.500 ave 533 max 524 min
Histogram: 1 0 0 0 0 0 1 1 0 1
Nlocal: 512.000 ave 512 max 512 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1028.00 ave 1028 max 1028 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 0.00000 ave 0 max 0 min
Histogram: 4 0 0 0 0 0 0 0 0 0
FullNghs: 3651.00 ave 3783 max 3494 min
Histogram: 1 0 0 0 0 1 1 0 0 1
Histogram: 1 0 0 0 0 0 0 0 0 0
FullNghs: 14604.0 ave 14604 max 14604 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 14604
Ave neighs/atom = 28.523438
Neighbor list builds = 1
Dangerous builds = 0
# Test Tersoff/ZBL model for SiC
clear
using 1 OpenMP thread(s) per MPI task
read_restart restart.equil
Reading restart file ...
restart file = 29 Sep 2021, LAMMPS = 29 Sep 2021
restoring atom style atomic from restart
orthogonal box = (0.0000000 0.0000000 0.0000000) to (21.724000 21.724000 21.724000)
1 by 1 by 1 MPI processor grid
pair style sw stores no restart info
512 atoms
read_restart CPU = 0.000 seconds
variable fac equal 0.6
change_box all x scale ${fac} y scale ${fac} z scale ${fac} remap
change_box all x scale 0.6 y scale ${fac} z scale ${fac} remap
change_box all x scale 0.6 y scale 0.6 z scale ${fac} remap
change_box all x scale 0.6 y scale 0.6 z scale 0.6 remap
Changing box ...
orthogonal box = (4.3448000 0.0000000 0.0000000) to (17.379200 21.724000 21.724000)
orthogonal box = (4.3448000 4.3448000 0.0000000) to (17.379200 17.379200 21.724000)
orthogonal box = (4.3448000 4.3448000 4.3448000) to (17.379200 17.379200 17.379200)
pair_style tersoff/zbl
pair_coeff * * SiC.tersoff.zbl C C C C Si Si Si Si
Reading tersoff/zbl potential file SiC.tersoff.zbl with DATE: 2009-04-15
thermo_style custom step temp epair etotal econserve press
thermo 10
fix 1 all nvt temp $t $t 0.1
fix 1 all nvt temp 1800 $t 0.1
fix 1 all nvt temp 1800 1800 0.1
Resetting global fix info from restart file:
fix style: nvt, fix ID: 1
timestep 1.0e-3
neighbor 1.0 bin
neigh_modify every 1 delay 10 check yes
run 100
All restart file global fix info was re-assigned
Neighbor list info ...
update every 1 steps, delay 10 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 4
ghost atom cutoff = 4
binsize = 2, bins = 7 7 7
1 neighbor lists, perpetual/occasional/extra = 1 0 0
(1) pair tersoff/zbl, perpetual
attributes: full, newton on
pair build: full/bin/atomonly
stencil: full/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 3.002 | 3.002 | 3.002 Mbytes
Step Temp E_pair TotEng Econserve Press
100 1112.9699 7067.9634 7141.4772 7129.3415 17683957
110 1676.669 7033.1458 7143.893 7128.6921 17837566
120 2450.2667 6982.2491 7144.094 7126.9524 18220027
130 2726.9659 6964.1219 7144.2432 7126.7678 18230324
140 2729.421 6962.7393 7143.0228 7127.2074 18176317
150 2738.5449 6959.1761 7140.0623 7127.6671 18068370
160 2687.2419 6958.1183 7135.6158 7127.8492 18156214
170 2697.7325 6952.1482 7130.3387 7127.7898 17978251
180 2577.9885 6954.5611 7124.8422 7127.5615 18068920
190 2502.6928 6954.4558 7119.7635 7127.67 18049652
200 2517.4866 6947.962 7114.2469 7127.1972 18209451
Loop time of 0.810948 on 1 procs for 100 steps with 512 atoms
Performance: 10.654 ns/day, 2.253 hours/ns, 123.312 timesteps/s
99.9% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.8082 | 0.8082 | 0.8082 | 0.0 | 99.66
Neigh | 0.001195 | 0.001195 | 0.001195 | 0.0 | 0.15
Comm | 0.00054765 | 0.00054765 | 0.00054765 | 0.0 | 0.07
Output | 0.0001018 | 0.0001018 | 0.0001018 | 0.0 | 0.01
Modify | 0.00062656 | 0.00062656 | 0.00062656 | 0.0 | 0.08
Other | | 0.0002768 | | | 0.03
Nlocal: 512.000 ave 512 max 512 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 1542.00 ave 1542 max 1542 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 0.00000 ave 0 max 0 min
Histogram: 1 0 0 0 0 0 0 0 0 0
FullNghs: 30142.0 ave 30142 max 30142 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 30142
Ave neighs/atom = 58.871094
Neighbor list builds = 1
Dangerous builds = 0
shell rm restart.equil
Total wall time: 0:00:01

View File

@ -1476,7 +1476,9 @@ int colvarmodule::write_output_files()
bi != biases.end();
bi++) {
// Only write output files if they have not already been written this time step
if ((*bi)->output_freq == 0 || (cvm::step_absolute() % (*bi)->output_freq) != 0) {
if ((*bi)->output_freq == 0 ||
cvm::step_relative() == 0 ||
(cvm::step_absolute() % (*bi)->output_freq) != 0) {
error_code |= (*bi)->write_output_files();
}
error_code |= (*bi)->write_state_to_replicas();

View File

@ -1,3 +1,3 @@
#ifndef COLVARS_VERSION
#define COLVARS_VERSION "2021-08-06"
#define COLVARS_VERSION "2021-09-21"
#endif

View File

@ -1,5 +1,5 @@
# /* ----------------------------------------------------------------------
# Generic Linux Makefile for CUDA
# Generic Linux Makefile for CUDA with the Multi-Process Service (MPS)
# - change CUDA_ARCH for your GPU
# ------------------------------------------------------------------------- */

View File

@ -39,11 +39,9 @@ HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
HIP_COMPILER=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
ifeq (hcc,$(HIP_PLATFORM))
HIP_OPTS += -ffast-math
# possible values: gfx803,gfx900,gfx906
HIP_ARCH = gfx906
else ifeq (amd,$(HIP_PLATFORM))
HIP_OPTS += -ffast-math
# possible values: gfx803,gfx900,gfx906
HIP_ARCH = gfx906
else ifeq (nvcc,$(HIP_PLATFORM))

View File

@ -1,5 +1,5 @@
# /* ----------------------------------------------------------------------
# Generic Linux Makefile for CUDA
# Generic Linux Makefile for CUDA
# - Change CUDA_ARCH for your GPU
# ------------------------------------------------------------------------- */
@ -13,7 +13,7 @@ endif
NVCC = nvcc
# obsolete hardware. not supported by current drivers anymore.
# obsolete hardware. not supported by current drivers and toolkits anymore.
#CUDA_ARCH = -arch=sm_13
#CUDA_ARCH = -arch=sm_10 -DCUDA_PRE_THREE
@ -28,11 +28,11 @@ NVCC = nvcc
#CUDA_ARCH = -arch=sm_37
# Maxwell hardware
CUDA_ARCH = -arch=sm_50
#CUDA_ARCH = -arch=sm_50
#CUDA_ARCH = -arch=sm_52
# Pascal hardware
#CUDA_ARCH = -arch=sm_60
CUDA_ARCH = -arch=sm_60
#CUDA_ARCH = -arch=sm_61
# Volta hardware
@ -70,7 +70,7 @@ LIB_DIR = ./
AR = ar
BSH = /bin/sh
# GPU binning not recommended with modern GPUs
# GPU binning not recommended for most modern GPUs
CUDPP_OPT = #-DUSE_CUDPP -Icudpp_mini
include Nvidia.makefile

View File

@ -1,6 +1,6 @@
# /* ----------------------------------------------------------------------
# Generic Linux Makefile for CUDA
# - Change CUDA_ARCH for your GPU
# Generic Linux Makefile for CUDA compiled for multiple compute capabilities
# - Add your GPU to CUDA_CODE
# ------------------------------------------------------------------------- */
# which file will be copied to Makefile.lammps

lib/gpu/Makefile.mpi Symbolic link
View File

@ -0,0 +1 @@
Makefile.linux

View File

@ -1,5 +1,5 @@
# /* ----------------------------------------------------------------------
# Generic Linux Makefile for CUDA
# Generic Linux Makefile for CUDA without MPI libraries
# - Change CUDA_ARCH for your GPU
# ------------------------------------------------------------------------- */
@ -28,11 +28,11 @@ NVCC = nvcc
#CUDA_ARCH = -arch=sm_37
# Maxwell hardware
CUDA_ARCH = -arch=sm_50
#CUDA_ARCH = -arch=sm_50
#CUDA_ARCH = -arch=sm_52
# Pascal hardware
#CUDA_ARCH = -arch=sm_60
CUDA_ARCH = -arch=sm_60
#CUDA_ARCH = -arch=sm_61
# Volta hardware
@ -41,6 +41,10 @@ CUDA_ARCH = -arch=sm_50
# Turing hardware
#CUDA_ARCH = -arch=sm_75
# Ampere hardware
#CUDA_ARCH = -arch=sm_80
#CUDA_ARCH = -arch=sm_86
# this setting should match LAMMPS Makefile
# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL

View File

@ -1,23 +0,0 @@
NVCC = $(CUDA_HOME)/bin/nvcc
EXTRAMAKE = Makefile.lammps.standard
CUDA_ARCH = -arch=sm_75
CUDA_PRECISION = -D_SINGLE_DOUBLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64 -lcudart
CUDA_OPTS = -DUNIX -O3 --use_fast_math --ftz=true
CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include
CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON -DLAMMPS_SMALLBIG
BIN_DIR = .
OBJ_DIR = obj
LIB_DIR = .
AR = ar
BSH = /bin/sh
# GPU binning not recommended with most modern GPUs
CUDPP_OPT = #-DUSE_CUDPP -Icudpp_mini
include Nvidia.makefile

View File

@ -556,16 +556,22 @@ void UCL_Device::add_properties(cl_device_id device_list) {
sizeof(float_width),&float_width,nullptr));
op.preferred_vector_width32=float_width;
// Determine if double precision is supported
cl_uint double_width;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
sizeof(double_width),&double_width,nullptr));
op.preferred_vector_width64=double_width;
if (double_width==0)
op.double_precision=false;
else
// Determine if double precision is supported: All bits in the mask must be set.
cl_device_fp_config double_mask = (CL_FP_FMA|CL_FP_ROUND_TO_NEAREST|CL_FP_ROUND_TO_ZERO|
CL_FP_ROUND_TO_INF|CL_FP_INF_NAN|CL_FP_DENORM);
cl_device_fp_config double_avail;
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_DOUBLE_FP_CONFIG,
sizeof(double_avail),&double_avail,nullptr));
if ((double_avail & double_mask) == double_mask)
op.double_precision=true;
else
op.double_precision=false;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PROFILING_TIMER_RESOLUTION,
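
The hunk above replaces the old CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE heuristic with an explicit query of the device's FP64 capability bits, and only reports double precision support when the full mask is set. A minimal standalone sketch of that check, assuming only the standard OpenCL C API (it mirrors the mask logic of the patch and is not the Geryon/UCL code itself):

#include <CL/cl.h>
#include <cstdio>

int main() {
  // Grab the first platform/device; error handling kept minimal for brevity.
  cl_platform_id platform;
  cl_device_id device;
  if (clGetPlatformIDs(1, &platform, nullptr) != CL_SUCCESS) return 1;
  if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr) != CL_SUCCESS) return 1;

  // Query the FP64 capability bitfield; 0 means no double precision at all.
  cl_device_fp_config fp64 = 0;
  clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(fp64), &fp64, nullptr);

  // Same requirement as the patch: FMA, all rounding modes, Inf/NaN, denormals.
  const cl_device_fp_config mask = CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO |
                                   CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM;
  const bool double_precision = ((fp64 & mask) == mask);
  std::printf("FP64 support: %s\n", double_precision ? "yes" : "no");
  return 0;
}

Checking the whole mask rather than just a non-zero bitfield matters because some devices advertise partial FP64 (for example without denormal support), which is not enough for the double and mixed precision GPU kernels.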

View File

@ -18,7 +18,7 @@
#endif
__kernel void kernel_cast_x(__global numtyp4 *restrict x_type,
const __global double *restrict x,
const __global numtyp *restrict x,
const __global int *restrict type,
const int nall) {
int ii=GLOBAL_ID_X;

View File

@ -475,7 +475,7 @@ class Atom {
UCL_Vector<numtyp,numtyp> v;
#ifdef GPU_CAST
UCL_Vector<double,double> x_cast;
UCL_Vector<numtyp,numtyp> x_cast;
UCL_Vector<int,int> type_cast;
#endif

View File

@ -34,7 +34,7 @@ BornCoulLongT::BornCoulLong() : BaseCharge<numtyp,acctyp>(),
}
template <class numtyp, class acctyp>
BornCoulLongT::~BornCoulLongT() {
BornCoulLongT::~BornCoulLong() {
clear();
}

View File

@ -34,7 +34,7 @@ BornCoulWolfT::BornCoulWolf() : BaseCharge<numtyp,acctyp>(),
}
template <class numtyp, class acctyp>
BornCoulWolfT::~BornCoulWolfT() {
BornCoulWolfT::~BornCoulWolf() {
clear();
}

View File

@ -34,7 +34,7 @@ BuckCoulLongT::BuckCoulLong() : BaseCharge<numtyp,acctyp>(),
}
template <class numtyp, class acctyp>
BuckCoulLongT::~BuckCoulLongT() {
BuckCoulLongT::~BuckCoulLong() {
clear();
}

View File

@ -333,6 +333,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
gpu_barrier();
}
// check if double precision support is available
#if defined(_SINGLE_DOUBLE) || defined(_DOUBLE_DOUBLE)
if (!gpu->double_precision())
return -16;
#endif
// Setup auto bin size calculation for calls from atom::sort
// - This is repeated in neighbor init with additional info
if (_user_cell_size<0.0) {
@ -546,14 +552,9 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
return -3;
if (_user_cell_size<0.0) {
#ifndef LAL_USE_OLD_NEIGHBOR
_neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size());
#else
_neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size());
#endif
} else
_neighbor_shared.setup_auto_cell_size(false,_user_cell_size,
nbor->simd_size());
_neighbor_shared.setup_auto_cell_size(false,_user_cell_size,nbor->simd_size());
nbor->set_cutoff(cutoff);
return 0;
@ -1038,10 +1039,18 @@ Device<PRECISION,ACC_PRECISION> global_device;
using namespace LAMMPS_AL;
bool lmp_has_gpu_device()
// check if a suitable GPU is present.
// for mixed and double precision GPU library compilation
// also the GPU needs to support double precision.
bool lmp_has_compatible_gpu_device()
{
UCL_Device gpu;
return (gpu.num_platforms() > 0);
bool compatible_gpu = gpu.num_platforms() > 0;
#if defined(_SINGLE_DOUBLE) || defined(_DOUBLE_DOUBLE)
if (compatible_gpu && !gpu.double_precision(0))
compatible_gpu = false;
#endif
return compatible_gpu;
}
std::string lmp_gpu_device_info()
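
The init_device() and lmp_has_compatible_gpu_device() hunks above gate mixed and double precision library builds on that FP64 probe. A minimal sketch of the gating pattern, reusing the _SINGLE_DOUBLE/_DOUBLE_DOUBLE macros that appear in the diff; the probe name fp64_supported() is made up here for illustration:

// Stub: in practice this would be the CL_DEVICE_DOUBLE_FP_CONFIG mask check
// sketched above; hard-wired here so the example is self-contained.
static bool fp64_supported() { return true; }

int init_device_sketch() {
#if defined(_SINGLE_DOUBLE) || defined(_DOUBLE_DOUBLE)
  // Mixed or full double precision build: refuse to initialize on a device
  // without FP64 (the patch returns -16 from init_device() in this case).
  if (!fp64_supported()) return -16;
#endif
  // A pure single precision build runs on any device.
  return 0;
}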

View File

@ -82,9 +82,9 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const __global numtyp4 *restrict coeff3,
const double cut_globalsq,
const double cut_innersq,
const double cut_inner,
const numtyp cut_globalsq,
const numtyp cut_innersq,
const numtyp cut_inner,
const int lj_types,
const __global int *dev_nbor,
const __global int *dev_packed,
@ -174,9 +174,9 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp4 *restrict coeff3_in,
const double cut_globalsq,
const double cut_innersq,
const double cut_inner,
const numtyp cut_globalsq,
const numtyp cut_innersq,
const numtyp cut_inner,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,

View File

@ -67,9 +67,9 @@ class ZBL : public BaseAtomic<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
double _cut_globalsq;
double _cut_innersq;
double _cut_inner;
numtyp _cut_globalsq;
numtyp _cut_innersq;
numtyp _cut_inner;
/// Number of atom types
int _lj_types;
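
Both ZBL hunks change kernel arguments and the matching class members from double to numtyp, so that a single precision build of the GPU library passes 32-bit cutoffs instead of mismatched 64-bit values. A small sketch of the typedef convention this relies on; the real lib/gpu headers define numtyp/acctyp through their own precision macros, so this is only an illustration of the idea:

// Precision switch (assumption: modeled on the lib/gpu convention where
// numtyp is the working precision of the kernels).
#if defined(_DOUBLE_DOUBLE)
typedef double numtyp;        // full double precision build
#else
typedef float numtyp;         // single and mixed precision builds
#endif

struct zbl_cutoffs_sketch {
  // Declaring these as numtyp keeps the host-side members and the kernel
  // arguments in sync for every precision mode; a hard-coded `double`
  // only matches the full double precision build.
  numtyp cut_globalsq, cut_innersq, cut_inner;
};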

View File

@ -17,11 +17,12 @@ parser = ArgumentParser(prog='Install.py',
# settings
version = '3.3.9'
version = '3.4.0'
tarball = "eigen.tar.gz"
# known checksums for different Eigen versions. used to validate the download.
checksums = { \
'3.4.0' : '4c527a9171d71a72a9d4186e65bea559', \
'3.3.9' : '609286804b0f79be622ccf7f9ff2b660', \
'3.3.7' : '9e30f67e8531477de4117506fe44669b' \
}
@ -35,7 +36,7 @@ Syntax from src dir: make lib-smd args="-b"
Syntax from lib dir: python Install.py -b
or: python Install.py -p /usr/include/eigen3"
or: python Install.py -v 3.3.7 -b
or: python Install.py -v 3.4.0 -b
Example:
@ -77,7 +78,7 @@ if pathflag:
if buildflag:
print("Downloading Eigen ...")
eigentar = os.path.join(homepath, tarball)
url = "https://gitlab.com/libeigen/eigen/-/archive/%s/eigen-%s.tar.gz" % (version,version)
url = "https://download.lammps.org/thirdparty/eigen-%s.tar.gz" % version
geturl(url, eigentar)
# verify downloaded archive integrity via md5 checksum, if known.

View File

@ -2,8 +2,8 @@ SHELL = /bin/sh
# ------ FILES ------
SRC_FILES = $(wildcard src/ML-PACE/*.cpp)
SRC = $(filter-out src/ML-PACE/pair_pace.cpp, $(SRC_FILES))
SRC_FILES = $(wildcard src/USER-PACE/*.cpp)
SRC = $(filter-out src/USER-PACE/pair_pace.cpp, $(SRC_FILES))
# ------ DEFINITIONS ------
@ -12,7 +12,7 @@ OBJ = $(SRC:.cpp=.o)
# ------ SETTINGS ------
CXXFLAGS = -O3 -fPIC -Isrc/ML-PACE
CXXFLAGS = -O3 -fPIC -Isrc/USER-PACE
ARCHIVE = ar
ARCHFLAG = -rc

View File

@ -1,3 +1,3 @@
pace_SYSINC =-I../../lib/pace/src/ML-PACE
pace_SYSINC =-I../../lib/pace/src/USER-PACE
pace_SYSLIB = -L../../lib/pace/ -lpace
pace_SYSPATH =

View File

@ -17,7 +17,7 @@ parser = ArgumentParser(prog='Install.py',
# settings
version = "2.7.1"
version = "2.7.4"
mode = "static"
# help message
@ -51,9 +51,14 @@ checksums = { \
'2.6.0' : '204d2edae58d9b10ba3ad460cad64191', \
'2.6.1' : '89a9a450fc6025299fe16af235957163', \
'2.6.3' : 'a9f8028fd74528c2024781ea1fdefeee', \
'2.6.5' : 'b67356f027e5c2747823b0422c3b0ec2', \
'2.6.6' : '6b470dcdce04c221ea42d8500b03c49b', \
'2.7.0' : '95f29dd0c067577f11972ff90dfc7d12', \
'2.7.1' : '4eac6a462ec84dfe0cec96c82421b8e8', \
'2.7.2' : 'cfa0b4dd90a81c25d3302e8d97bfeaea', \
'2.7.3' : 'f00cc82edfefe6bb3df934911dbe32fb', \
'2.7.4' : 'f858e0b6aed173748fc85b6bc8a9dcb3', \
'2.8.0' : '489b23daba70da78cf0506cbc31689c6', \
}
# parse and process arguments


@@ -26,6 +26,8 @@
#error "Unsupported QE coupling API. Want API version 1."
#endif
// we need to pass an MPI communicator to the LAMMPS library interface
#define LAMMPS_LIB_MPI
#include "library.h"
static const char delim[] = " \t\n\r";
@@ -67,8 +69,8 @@ int main(int argc, char **argv)
#if 1 // AK: temporary hack
if ( qmmmcfg.nmm != 2 ) {
if (me == 0) {
fprintf( stderr, "\n Error in the number of processors for MM code"
"\n for the time being only two processor are allowed\n");
fprintf( stderr, "\n Error in the number of processors for the MM code.\n"
" Currently only requesting 2 MM processors is allowed.\n");
}
MPI_Finalize();
return -1;


@@ -1,4 +1,4 @@
# DATE: 2017-11-28 CONTRIBUTOR: J.H. Los, J.M.H. Kroes CITATION: Los et al. Phys. Rev. B 96, 184108 (2017)
# UNITS: metal DATE: 2017-11-28 CONTRIBUTOR: J.H. Los, J.M.H. Kroes CITATION: Los et al. Phys. Rev. B 96, 184108 (2017)
# B and N mixture, parameterized for ExTeP potential
@@ -9,15 +9,15 @@
# other quantities are unitless
# format of a single entry (one or more lines):
#I J K m, gamma*, lambda3, c, d, h, n, gamma, lambda2, B, R, D, lambda1, A
B B B 3 1.0 0.0 26617.3000 141.2000 -0.1300 1.1422470 0.01498959 2.5211820 2768.7363631 2.0 0.2 2.6857244 3376.3350735
N N N 3 1.0 0.0 23.5000 3.7500 -0.4000 0.6650000 0.01925100 2.6272721 2563.5603417 2.0 0.2 2.8293093 2978.9527928
B B N 3 1.0 0.0 26617.3000 141.2000 -0.1300 1.1422470 0.01498959 2.5211820 2768.7363631 2.0 0.2 2.6857244 3376.3350735
N N B 3 1.0 0.0 23.5000 3.7500 -0.4000 0.6650000 0.01925100 2.6272721 2563.5603417 2.0 0.2 2.8293093 2978.9527928
B N B 3 1.0 0.0d0 306.586555205d0 10.d0 -0.7218d0 0.6576543657d0 0.0027024851d0 2.69335d0 2595.6860833266d0 2.d0 0.2d0 2.95d0 3330.0655849887d0
B N N 3 1.0 0.0d0 306.586555205d0 10.d0 -0.7218d0 0.6576543657d0 0.0027024851d0 2.69335d0 2595.6860833266d0 2.d0 0.2d0 2.95d0 3330.0655849887d0
N B B 3 1.0 0.0d0 306.586555205d0 10.d0 -0.7218d0 0.6576543657d0 0.0027024851d0 2.69335d0 2595.6860833266d0 2.d0 0.2d0 2.95d0 3330.0655849887d0
N B N 3 1.0 0.0d0 306.586555205d0 10.d0 -0.7218d0 0.6576543657d0 0.0027024851d0 2.69335d0 2595.6860833266d0 2.d0 0.2d0 2.95d0 3330.0655849887d0
#I J K m, gamma*, lambda3, c, d, h, n, gamma, lambda2, B, R, D, lambda1, A
B B B 3 1.0 0.0 26617.3000 141.2000 -0.1300 1.1422470 0.01498959 2.5211820 2768.7363631 2.0 0.2 2.6857244 3376.3350735
N N N 3 1.0 0.0 23.5000 3.7500 -0.4000 0.6650000 0.01925100 2.6272721 2563.5603417 2.0 0.2 2.8293093 2978.9527928
B B N 3 1.0 0.0 26617.3000 141.2000 -0.1300 1.1422470 0.01498959 2.5211820 2768.7363631 2.0 0.2 2.6857244 3376.3350735
N N B 3 1.0 0.0 23.5000 3.7500 -0.4000 0.6650000 0.01925100 2.6272721 2563.5603417 2.0 0.2 2.8293093 2978.9527928
B N B 3 1.0 0.0 306.586555205 10. -0.7218 0.6576543657 0.0027024851 2.69335 2595.6860833266 2.0 0.2 2.95 3330.0655849887
B N N 3 1.0 0.0 306.586555205 10. -0.7218 0.6576543657 0.0027024851 2.69335 2595.6860833266 2.0 0.2 2.95 3330.0655849887
N B B 3 1.0 0.0 306.586555205 10. -0.7218 0.6576543657 0.0027024851 2.69335 2595.6860833266 2.0 0.2 2.95 3330.0655849887
N B N 3 1.0 0.0 306.586555205 10. -0.7218 0.6576543657 0.0027024851 2.69335 2595.6860833266 2.0 0.2 2.95 3330.0655849887
#
# 1.9925 Bicubic Splines Parameters
#
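Note: the rewritten ExTeP entries above drop the Fortran-style 'd0' exponent suffixes (e.g. '10.d0', '0.0d0'), presumably because the potential-file reader expects plain C-style floating-point literals and a parser like strtod() does not treat 'd' as an exponent marker. A hypothetical one-off cleanup helper for such lines could look like the sketch below; it maps 'd' exponents to the equivalent 'e' form instead of stripping them outright as the updated file does:

import re

# match Fortran double-precision literals such as 2.d0, 0.0d0 or 306.586555205d0
_FORTRAN_DBL = re.compile(r'(?<![\w.])(\d+\.?\d*|\.\d+)[dD]([+-]?\d+)(?![\w.])')

def normalize_fortran_doubles(line):
    # rewrite "<mantissa>d<exp>" as "<mantissa>e<exp>", which any C-style parser accepts
    return _FORTRAN_DBL.sub(lambda m: m.group(1) + 'e' + m.group(2), line)

print(normalize_fortran_doubles("B N B 3 1.0 0.0d0 306.586555205d0 10.d0"))
# -> B N B 3 1.0 0.0e0 306.586555205e0 10.e0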


@@ -1,12 +1,13 @@
# DATE: 2018-11-28 UNITS: metal CONTRIBUTOR: Zbigniew Koziol softquake@gmail.com CITATION: Z. Koziol et al.: https://arxiv.org/abs/1803.05162
# DATE: 2021-11-04 UNITS: metal CONTRIBUTOR: Zbigniew Koziol softquake@gmail.com CITATION: Z. Koziol et al.: https://arxiv.org/abs/1803.05162
#
# Lebedeva Potential. https://doi.org/10.1016/j.physe.2011.07.018
# Lebedeva potential: https://doi.org/10.1039/C0CP02614J and https://doi.org/10.1016/j.physe.2011.07.018
# Parameters must be in this order as here, otherwise their values may be changed.
# Energies here are given in meV.
# The last one, S, is convenient for scaling the potential amplitude. S is a multiplication factor for A, B, C
# A B C z0 alpha D1 D2 lambda1 lambda2 S
# These are values according to Levedeva et al
#C C 10.510 11.6523.34 35.883 3.34 4.16 -0.86232 0.10049 0.48703 0.46445 1.0
# These are values according to Lebedeva et al.: https://doi.org/10.1016/j.cplett.2012.03.082
C C 10.510 11.652 29.5 3.34 4.16 -0.86232 0.10049 0.48703 0.46445 1.0
#
# These are values by Z. Koziol et al.: https://arxiv.org/abs/1803.05162
C C 14.558 21.204 1.8 3.198 4.16 -0.862 0.10049 0.6 0.4 1.0
C1 C1 14.558 21.204 1.8 3.198 4.16 -0.862 0.10049 0.6 0.4 1.0
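Note: the comments in the Lebedeva parameter file spell out the column order (A B C z0 alpha D1 D2 lambda1 lambda2 S) and describe S as a plain multiplier for A, B and C. The illustrative parser below (not the LAMMPS reader) just makes those column semantics concrete:

FIELDS = ("A", "B", "C", "z0", "alpha", "D1", "D2", "lambda1", "lambda2", "S")

def parse_lebedeva_line(line):
    # split one parameter line into the two element labels and a name->value map
    tokens = line.split()
    elem_i, elem_j = tokens[0], tokens[1]
    params = dict(zip(FIELDS, map(float, tokens[2:2 + len(FIELDS)])))
    scale = params.pop("S")
    for key in ("A", "B", "C"):   # S scales the amplitude parameters A, B, C
        params[key] *= scale
    return elem_i, elem_j, params

print(parse_lebedeva_line("C C 14.558 21.204 1.8 3.198 4.16 -0.862 0.10049 0.6 0.4 1.0"))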


@@ -122,6 +122,9 @@ class lammps(object):
for f in os.listdir(winpath)]):
lib_ext = ".dll"
modpath = winpath
elif any([f.startswith('liblammps') and f.endswith('.so')
for f in os.listdir(modpath)]):
lib_ext = ".so"
else:
import platform
if platform.system() == "Darwin":
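Note: the hunk above adds a branch that picks up a liblammps*.so sitting next to the python module before falling back to the platform check. A standalone sketch of that lookup order is below; modpath and winpath are placeholder arguments here, and the '.dylib' fallback for Darwin is assumed from the usual macOS convention rather than shown in the excerpt:

import os
import platform

def guess_liblammps_ext(modpath, winpath=None):
    def has_lib(path, suffix):
        # true if the directory exists and contains a liblammps* file with this suffix
        return bool(path) and os.path.isdir(path) and any(
            f.startswith("liblammps") and f.endswith(suffix) for f in os.listdir(path))

    if has_lib(winpath, ".dll"):       # Windows-style install directory first
        return ".dll", winpath
    if has_lib(modpath, ".so"):        # explicit .so next to the module
        return ".so", modpath
    # otherwise fall back to the usual platform convention
    return (".dylib" if platform.system() == "Darwin" else ".so"), modpath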

src/.gitignore

@@ -858,8 +858,6 @@
/fix_ti_rs.h
/fix_ti_spring.cpp
/fix_ti_spring.h
/fix_ttm.cpp
/fix_ttm.h
/fix_tune_kspace.cpp
/fix_tune_kspace.h
/fix_wall_body_polygon.cpp
@@ -919,6 +917,7 @@
/improper_ring.h
/improper_umbrella.cpp
/improper_umbrella.h
/interlayer_taper.h
/kissfft.h
/lj_sdk_common.h
/math_complex.h
@@ -933,7 +932,6 @@
/msm_cg.h
/neb.cpp
/neb.h
/pair_adp.cpp
/pair_adp.h
/pair_agni.cpp
@@ -994,6 +992,8 @@
/pair_cosine_squared.h
/pair_coul_diel.cpp
/pair_coul_diel.h
/pair_coul_exclude.cpp
/pair_coul_exclude.h
/pair_coul_long.cpp
/pair_coul_long.h
/pair_coul_msm.cpp
@@ -1431,6 +1431,10 @@
/fix_srp.h
/fix_tfmc.cpp
/fix_tfmc.h
/fix_ttm.cpp
/fix_ttm.h
/fix_ttm_grid.cpp
/fix_ttm_grid.h
/fix_ttm_mod.cpp
/fix_ttm_mod.h
/pair_born_coul_long_cs.cpp


@@ -13,16 +13,18 @@
------------------------------------------------------------------------- */
/** Fix Drude Transform ******************************************************/
#include "fix_drude_transform.h"
#include "atom.h"
#include "comm.h"
#include "domain.h"
#include "error.h"
#include "fix_drude.h"
#include "modify.h"
#include <cmath>
#include <cstring>
#include "fix_drude.h"
#include "atom.h"
#include "domain.h"
#include "comm.h"
#include "error.h"
#include "modify.h"
using namespace LAMMPS_NS;
using namespace FixConst;


@@ -25,10 +25,10 @@ FixStyle(drude/transform/inverse,FixDrudeTransform<true>);
namespace LAMMPS_NS {
template <bool inverse> class FixDrudeTransform : public Fix {
template <bool inverse> class FixDrudeTransform: public Fix {
public:
FixDrudeTransform<inverse>(class LAMMPS *, int, char **);
~FixDrudeTransform<inverse>();
FixDrudeTransform(class LAMMPS *, int, char **);
~FixDrudeTransform();
int setmask();
void init();
void setup(int vflag);


@@ -43,7 +43,6 @@ class ComputeTempRotate : public Compute {
double memory_usage();
private:
int fix_dof;
double tfactor, masstotal;
double **vbiasall; // stored velocity bias for all atoms
int maxbias; // size of vbiasall array

Some files were not shown because too many files have changed in this diff.