Merge branch 'develop' into kk_pace_opt

# Conflicts:
#	src/ML-PACE/compute_pace.cpp
Axel Kohlmeyer
2024-01-04 21:08:50 -05:00
93 changed files with 1100 additions and 631 deletions

View File

@ -35,7 +35,7 @@ jobs:
python-version: '3.x'
- name: Initialize CodeQL
uses: github/codeql-action/init@v2
uses: github/codeql-action/init@v3
with:
languages: ${{ matrix.language }}
config-file: ./.github/codeql/${{ matrix.language }}.yml
@ -55,4 +55,4 @@ jobs:
cmake --build . --parallel 2
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2
uses: github/codeql-action/analyze@v3

View File

@ -18,11 +18,11 @@ set(MPI_CXX_COMPILER "mpicxx" CACHE STRING "" FORCE)
unset(HAVE_OMP_H_INCLUDE CACHE)
set(OpenMP_C "icx" CACHE STRING "" FORCE)
set(OpenMP_C_FLAGS "-qopenmp -qopenmp-simd" CACHE STRING "" FORCE)
set(OpenMP_C_FLAGS "-qopenmp;-qopenmp-simd" CACHE STRING "" FORCE)
set(OpenMP_C_LIB_NAMES "omp" CACHE STRING "" FORCE)
set(OpenMP_CXX "icpx" CACHE STRING "" FORCE)
set(OpenMP_CXX_FLAGS "-qopenmp -qopenmp-simd" CACHE STRING "" FORCE)
set(OpenMP_CXX_FLAGS "-qopenmp;-qopenmp-simd" CACHE STRING "" FORCE)
set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE)
set(OpenMP_Fortran_FLAGS "-qopenmp -qopenmp-simd" CACHE STRING "" FORCE)
set(OpenMP_Fortran_FLAGS "-qopenmp;-qopenmp-simd" CACHE STRING "" FORCE)
set(OpenMP_omp_LIBRARY "libiomp5.so" CACHE PATH "" FORCE)

View File

@ -36,10 +36,10 @@ requests.
MUST be submitted as a pull request to GitHub. All changes to the
"develop" branch must be made exclusively through merging pull requests.
The "release" and "stable" branches, respectively, are only to be
updated upon feature or stable releases based on the associated
tags. Updates to the stable release in between stable releases
updated upon "feature releases" or "stable releases" based on the
associated tags. Updates to the stable release in between stable releases
(for example, back-ported bug fixes) are first merged into the "maintenance"
branch and then into the "stable" branch as update releases.
branch and then into the "stable" branch as "stable update releases".
Pull requests may also be submitted to (long-running) feature branches
created by LAMMPS developers inside the LAMMPS project, if needed. Those
@ -131,7 +131,7 @@ testing -- that the code in the branch "develop" does not get easily
broken. These tests are run after every update to a pull request. More
extensive and time-consuming tests (including regression testing) are
performed after code is merged to the "develop" branch. There are feature
releases of LAMMPS made about every 4-6 weeks at a point, when the LAMMPS
releases of LAMMPS made about every 4-8 weeks at a point when the LAMMPS
developers feel that a sufficient number of changes have been included
and all post-merge testing has been successful. These feature releases are
marked with a `patch_<version date>` tag and the "release" branch

View File

@ -16,8 +16,11 @@ clean:
rm -f $(IMGSVG) $(IMGPDF) $(IMGPNG) *~
ifeq ($(HAS_DOT),YES)
$(IMGDIR)/%.png: %.dot
$(IMGDIR)/lammps-classes.png : lammps-classes.dot
dot -Tpng -Kneato -o $@ $<
$(IMGDIR)/%.png: %.dot
dot -Tpng -Kdot -o $@ $<
endif
ifeq ($(HAS_DOT),NO)

View File

@ -0,0 +1,34 @@
// LAMMPS branches and releases
digraph releases {
rankdir="LR";
github [shape="box" label="Pull Requests\non GitHub" height=0.75];
github -> develop [label="Merge commits"];
{
rank = "same";
work [shape="none" label="Development branches:"]
develop [label="'develop' branch" height=0.75];
maintenance [label="'maintenance' branch" height=0.75];
};
{
rank = "same";
upload [shape="none" label="Release branches:"]
release [label="'release' branch" height=0.75];
stable [label="'stable' branch" height=0.75];
};
develop -> release [label="Feature release\n(every 4-8 weeks)"];
release -> stable [label="Stable release\n(once per year)"];
stable -> maintenance [label="Reset on stable release" style="setlinewidth(2)"];
develop -> maintenance [label="Backports of bugfixes" style="dashed"];
maintenance -> stable [label="Updates to stable release"];
{
rank = "same";
tag [shape="none" label="Applied tags:"];
patchtag [shape="box" label="patch_<date>"];
stabletag [shape="box" label="stable_<date>"];
updatetag [shape="box" label="stable_<date>_update<num>"];
};
release -> patchtag [label="feature release" style="dotted"];
stable -> stabletag [label="stable release" style="dotted"];
stable -> updatetag [label="update release" style="dotted"];
}
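For reference, a minimal sketch of rendering this graph by hand with Graphviz, following the generic ``%.png: %.dot`` rule from the Makefile hunk above (the output file name is chosen here for illustration):

.. code-block:: console

   $ dot -Tpng -Kdot -o lammps-releases.png lammps-releases.dot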

View File

@ -255,16 +255,18 @@ A test run is then a collection of multiple individual test runs, each
with many comparisons to reference results based on template input
files, individual command settings, relative error margins, and
reference data stored in a YAML format file with ``.yaml``
suffix. Currently the programs ``test_pair_style``, ``test_bond_style``, and
``test_angle_style`` are implemented. They will compare forces, energies and
(global) stress for all atoms after a ``run 0`` calculation and after a
few steps of MD with :doc:`fix nve <fix_nve>`, each in multiple variants
with different settings and also for multiple accelerated styles. If a
prerequisite style or package is missing, the individual tests are
skipped. All tests will be executed on a single MPI process, so using
the CMake option ``-D BUILD_MPI=off`` can significantly speed up testing,
since this will skip the MPI initialization for each test run.
Below is an example command and output:
suffix. Currently the programs ``test_pair_style``, ``test_bond_style``,
``test_angle_style``, ``test_dihedral_style``, and
``test_improper_style`` are implemented. They will compare forces,
energies and (global) stress for all atoms after a ``run 0`` calculation
and after a few steps of MD with :doc:`fix nve <fix_nve>`, each in
multiple variants with different settings and also for multiple
accelerated styles. If a prerequisite style or package is missing, the
individual tests are skipped. All force style tests will be executed on
a single MPI process, so using the CMake option ``-D BUILD_MPI=off`` can
significantly speed up testing, since this will skip the MPI
initialization for each test run. Below is an example command and
output:
.. code-block:: console
@ -416,15 +418,16 @@ When compiling LAMMPS with enabled tests, most test executables will
need to be linked against the LAMMPS library. Since this can be a very
large library with many C++ objects when many packages are enabled, link
times can become very long on machines that use the GNU BFD linker (e.g.
Linux systems). Alternatives like the ``lld`` linker of the LLVM project
or the ``gold`` linker available with GNU binutils can speed up this step
substantially. CMake will by default test if any of the two can be
enabled and use it when ``ENABLE_TESTING`` is active. It can also be
selected manually through the ``CMAKE_CUSTOM_LINKER`` CMake variable.
Allowed values are ``lld``, ``gold``, ``bfd``, or ``default``. The
``default`` option will use the system default linker otherwise, the
linker is chosen explicitly. This option is only available for the
GNU or Clang C++ compiler.
Linux systems). Alternatives like the ``mold`` linker, the ``lld``
linker of the LLVM project, or the ``gold`` linker available with GNU
binutils can speed up this step substantially (in this order). CMake
will by default test if any of the three can be enabled and use it when
``ENABLE_TESTING`` is active. It can also be selected manually through
the ``CMAKE_CUSTOM_LINKER`` CMake variable. Allowed values are
``mold``, ``lld``, ``gold``, ``bfd``, or ``default``. The ``default``
option will use the system default linker; otherwise, the linker is
chosen explicitly. This option is only available for the GNU or Clang
C++ compilers.
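As a hedged sketch (the source and build directory names are placeholders, not taken from the documentation), selecting one of these linkers manually during configuration could look like:

.. code-block:: console

   $ cmake -S cmake -B build-testing -D ENABLE_TESTING=on -D CMAKE_CUSTOM_LINKER=mold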
Tests for other components and utility functions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@ -70,6 +70,9 @@ File and path functions and global constants
.. doxygenfunction:: is_console
:project: progguide
.. doxygenfunction:: disk_free
:project: progguide
.. doxygenfunction:: path_is_directory
:project: progguide

View File

@ -121,7 +121,7 @@ will be suppressed and only a summary printed, but adding
the '-V' option will then produce output from the tests
above like the following:
.. code-block::
.. code-block:: console
[...]
1: [ RUN ] Tokenizer.empty_string
@ -526,3 +526,102 @@ The ``unittest/tools`` folder contains tests for programs in the
shell, which are implemented as Python scripts using the ``unittest``
Python module and launching the tool commands through the ``subprocess``
Python module.
Troubleshooting failed unit tests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
There are by default no unit tests for newly added features (e.g. pair, fix,
or compute styles) unless your pull request also includes tests for the
added features. If you are modifying existing features, you may see
failures for existing tests if your modifications have unexpected side
effects or render the existing tests invalid. If you are adding an
accelerated version of an existing style, then only tests for INTEL,
KOKKOS (with OpenMP only), OPENMP, and OPT will be run automatically.
Tests for the GPU package are time consuming and thus are only run
*after* a merge, or when the special label ``gpu_unit_tests`` is added
to the pull request. After the test has started, it is often best to
remove the label, since every PR activity will re-trigger the test
(that is a limitation of triggering a test with a label). Unit tests
using KOKKOS with GPU acceleration are currently not supported.
When you see a failed build on GitHub, click on ``Details`` to be taken
to the corresponding LAMMPS Jenkins CI web page. Click on the "Exit"
symbol near the ``Logout`` button on the top right of that page to go to
the "classic view". In the classic view, there is a list of the
individual runs that make up this test run (they are shown but cannot be
inspected in the default view). You can click on any of those.
Clicking on ``Test Result`` will display the list of failed tests. Click
on the "Status" column to sort the tests based on their Failed or Passed
status. Then click on the failed test to expand its output.
For example, the following output snippet shows a failed unit test:
.. code-block:: console
[ RUN ] PairStyle.gpu
/home/builder/workspace/dev/pull_requests/ubuntu_gpu/unit_tests/cmake_gpu_opencl_mixed_smallbig_clang_static/unittest/force-styles/test_main.cpp:63: Failure
Expected: (err) <= (epsilon)
Actual: 0.00018957912910606503 vs 0.0001
Google Test trace:
/home/builder/workspace/dev/pull_requests/ubuntu_gpu/unit_tests/cmake_gpu_opencl_mixed_smallbig_clang_static/unittest/force-styles/test_main.cpp:56: EXPECT_FORCES: init_forces (newton off)
/home/builder/workspace/dev/pull_requests/ubuntu_gpu/unit_tests/cmake_gpu_opencl_mixed_smallbig_clang_static/unittest/force-styles/test_main.cpp:64: Failure
Expected: (err) <= (epsilon)
Actual: 0.00022892713393549854 vs 0.0001
The failed assertions provide line numbers in the test source
(e.g. ``test_main.cpp:56``), from which one can understand what
specific assertion failed.
Note that the force style engine runs one of a small number of systems
in a rather off-equilibrium configuration with a few atoms for a few
steps, writes data and restart files, uses :doc:`the clear command
<clear>` to reset LAMMPS, and then runs from those files with different
settings (e.g. newton on/off) and integrators (e.g. verlet vs. respa).
Beyond potential issues or bugs in the source code, a mismatch between
the expected and actual values can indicate that force arrays are not
properly cleared between multiple run commands, or that class members
are not correctly initialized or not correctly written to or read from
a data or restart file.
While the epsilon (relative precision) for a single, `IEEE 754 compliant
<https://en.wikipedia.org/wiki/IEEE_754>`_, double precision floating
point operation is about 2.2e-16, the achievable precision for the
tests is lower because most numbers are sums over intermediate results
and the non-associativity of floating point math leads to larger
errors. In some cases, specific properties of the tested style reduce
the achievable precision further. As a rule of thumb, the test epsilon
can often be in the range 5.0e-14 to 1.0e-13. But for "noisy" force
kernels, e.g. those with a larger number of arithmetic operations
involving `exp()`, `log()`, or `sin()` functions, and also due to the
effect of compiler optimization or differences between compilers or
platforms, epsilon may need to be relaxed further, sometimes to
1.0e-12. If interpolation or lookup tables are used, epsilon may need
to be set to 1.0e-10 or even higher.
For tests of accelerated styles, the per-test epsilon is multiplied by
empirical factors that take into account differences in the order of
floating point operations and the fact that some or most intermediate
operations may be done using approximations or with single precision
floating point math.
To rerun the failed unit test individually, change to the ``build`` directory
and run the test with verbose output. For example,
.. code-block:: bash
env TEST_ARGS=-v ctest -R ^MolPairStyle:lj_cut_coul_long -V
``ctest`` with the ``-V`` flag also shows the exact command line
of the test. One can then use ``gdb --args`` to further debug and
catch exceptions with the test command, for example,
.. code-block:: bash
gdb --args /path/to/lammps/build/test_pair_style /path/to/lammps/unittest/force-styles/tests/mol-pair-lj_cut_coul_long.yaml
It is recommended to configure the build with ``-D
BUILD_SHARED_LIBS=on`` and use a custom linker to shorten the build time
during recompilation. Installing `ccache` in your development
environment helps speed up recompilation by caching previous
compilations and detecting when the same compilation is being done
again. Please see :doc:`Build_development` for further details.
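A minimal sketch of such a configuration, assuming ``ccache`` and ``mold`` are installed (the directory names are placeholders, and ``CMAKE_CXX_COMPILER_LAUNCHER`` is the generic CMake mechanism for compiler caching, not a LAMMPS-specific option):

.. code-block:: console

   $ cmake -S cmake -B build -D ENABLE_TESTING=on -D BUILD_SHARED_LIBS=on \
         -D CMAKE_CUSTOM_LINKER=mold -D CMAKE_CXX_COMPILER_LAUNCHER=ccache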

View File

@ -480,11 +480,11 @@ Some recent changes to the workflow are not captured in this tutorial.
For example, in addition to the *develop* branch, to which all new
features should be submitted, there is also a *release*, a *stable*, and
a *maintenance* branch; the *release* branch is updated from the
*develop* as part of a feature release, and *stable* (together with
*release*) are updated from *develop* when a stable release is made. In
between stable releases, selected bug fixes and infrastructure updates
are back-ported from the *develop* branch to the *maintenance* branch
and occasionally merged to *stable* as an update release.
*develop* branch as part of a "feature release", and *stable* (together
with *release*) are updated from *develop* when a "stable release" is
made. In between stable releases, selected bug fixes and infrastructure
updates are back-ported from the *develop* branch to the *maintenance*
branch and occasionally merged to *stable* as an update release.
Furthermore, the naming of the release tags now follows the pattern
"patch_<Day><Month><Year>" to simplify comparisons between releases.

View File

@ -28,16 +28,16 @@ provides `limited support for subversion clients <svn_>`_.
You can follow the LAMMPS development on 4 different git branches:
* **release** : this branch is updated with every patch or feature release;
updates are always "fast-forward" merges from *develop*
* **develop** : this branch follows the ongoing development and
is updated with every merge commit of a pull request
* **stable** : this branch is updated from the *release* branch with
every stable release version and also has selected bug fixes with every
update release when the *maintenance* branch is merged into it
* **maintenance** : this branch collects back-ported bug fixes from the
*develop* branch to the *stable* branch. It is used to update *stable*
for update releases and it synchronized with *stable* at each stable release.
* **develop** : this branch follows the ongoing development and is
updated with every merge commit of a pull request
* **release** : this branch is updated with every "feature release";
updates are always "fast-forward" merges from *develop*
* **maintenance** : this branch collects back-ported bug fixes from the
*develop* branch to the *stable* branch. It is used to update the
*stable* branch for "stable update releases".
* **stable** : this branch is updated from the *release* branch with
every "stable release" version and also has selected bug fixes with
every "update release" when the *maintenance* branch is merged into it
To access the git repositories on your box, use the clone command to
create a local copy of the LAMMPS repository with a command like:

Binary file not shown. (image: 286 KiB before, 304 KiB after)

Binary file not shown. (image: 36 KiB before, 38 KiB after)

Binary file not shown. (image: 68 KiB after; no previous version)

Binary file not shown. (image: 36 KiB before, 36 KiB after)

Binary file not shown. (image: 28 KiB before, 28 KiB after)

View File

@ -3,45 +3,25 @@ What does a LAMMPS version mean
The LAMMPS "version" is the date when it was released, such as 1 May
2014. LAMMPS is updated continuously, and we aim to keep it working
correctly and reliably at all times. You can follow its development
in a public `git repository on GitHub <https://github.com/lammps/lammps>`_.
Modifications of the LAMMPS source code (like bug fixes, code refactors,
updates to existing features, or addition of new features) are organized
into pull requests. Pull requests will be merged into the *develop*
branch of the git repository after they pass automated testing and code
review by the LAMMPS developers. When a sufficient number of changes
have accumulated *and* the *develop* branch version passes an extended
set of automated tests, we release it as a *feature release*, which are
currently made every 4 to 8 weeks. The *release* branch of the git
repository is updated with every such release. A summary of the most
important changes of the patch releases are on `this website page
<https://www.lammps.org/bug.html>`_. More detailed release notes are
`available on GitHub <https://github.com/lammps/lammps/releases/>`_.
Once or twice a year, we have a "stabilization period" where we apply
only bug fixes and small, non-intrusive changes to the *develop*
branch. At the same time, the code is subjected to more detailed and
thorough manual testing than the default automated testing. Also,
several variants of static code analysis are run to improve the overall
code quality, consistency, and compliance with programming standards,
best practices and style conventions.
The release after such a stabilization period is called a *stable*
version and both, the *release* and the *stable* branches are updated
with it. Between stable releases, we collect back-ported bug fixes and
updates from the *develop* branch in the *maintenance* branch. From the
*maintenance* branch we make occasional update releases and update the
*stable* branch accordingly.
correctly and reliably at all times. Also, several variants of static
code analysis are run regularly to maintain or improve the overall code
quality, consistency, and compliance with programming standards, best
practices and style conventions. You can follow its development in a
public `git repository on GitHub <https://github.com/lammps/lammps>`_.
Each version of LAMMPS contains all the documented *features* up to and
including its version date. For recently added features, we add markers
to the documentation at which specific LAMMPS version a feature or
keyword was added or significantly changed.
The version date is printed to the screen and log file every time you run
LAMMPS. It is also in the file src/version.h and in the LAMMPS
directory name created when you unpack a tarball. And it is on the
Identifying the Version
^^^^^^^^^^^^^^^^^^^^^^^
The version date is printed to the screen and log file every time you
run LAMMPS. There is also an indication if a LAMMPS binary was
compiled from a version with modifications **after** a release.
It is also visible in the file src/version.h and in the LAMMPS directory
name created when you unpack a downloaded tarball. And it is on the
first page of the :doc:`manual <Manual>`.
* If you browse the HTML pages of the online version of the LAMMPS
@ -53,3 +33,56 @@ first page of the :doc:`manual <Manual>`.
* If you browse the HTML pages included in your downloaded tarball, they
describe the version you have, which may be older than the online
version.
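A quick sketch of checking the version date described above from a compiled binary or an unpacked source tree (assuming an ``lmp`` executable in the current directory; the help output format is only paraphrased in the comment):

.. code-block:: console

   $ ./lmp -h | head -n 2      # the help header includes the version date
   $ cat src/version.h         # the version date as a preprocessor define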
LAMMPS releases, branches, and tags
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. figure:: JPG/lammps-releases.png
:figclass: align-center
Relations between releases, main branches, and tags in the LAMMPS git repository
Development
"""""""""""
Modifications of the LAMMPS source code (like bug fixes, code
refactoring, updates to existing features, or addition of new features)
are organized into pull requests. Pull requests will be merged into the
*develop* branch of the git repository after they pass automated testing
and code review by the LAMMPS developers.
Feature Releases
""""""""""""""""
When a sufficient number of new features and updates have accumulated
*and* the LAMMPS version on the *develop* branch passes an extended set
of automated tests, we release it as a *feature release*; these are
currently made every 4 to 8 weeks. The *release* branch of the git
repository is updated with every such *feature release* and a tag in the
format ``patch_1May2014`` is added. A summary of the most important
changes of these releases for the current year is posted on `this
website page <https://www.lammps.org/bug.html>`_. More detailed release
notes are `available on GitHub
<https://github.com/lammps/lammps/releases/>`_.
Stable Releases
"""""""""""""""
About once a year, we release a *stable release* version of LAMMPS.
This is done after a "stabilization period" where we apply only bug
fixes and small, non-intrusive changes to the *develop* branch but no
new features. At the same time, the code is subjected to more detailed
and thorough manual testing than the default automated testing.
After such a *stable release*, both the *release* and the *stable*
branches are updated and two tags are applied: a ``patch_1May2014``
format tag and a ``stable_1May2014`` format tag.
Stable Release Updates
""""""""""""""""""""""
Between *stable releases*, we collect bug fixes and updates back-ported
from the *develop* branch in a branch called *maintenance*. From the
*maintenance* branch we make occasional *stable update releases* and
update the *stable* branch accordingly. The first update to the
``stable_1May2014`` release would be tagged as
``stable_1May2014_update1``. These updates contain no new features.
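As a sketch of inspecting and checking out such a tag in a local clone (the tag name below is the example form used in the text, not necessarily an existing release):

.. code-block:: console

   $ git fetch --tags
   $ git tag --list 'stable_*'
   $ git checkout stable_1May2014_update1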

View File

@ -183,4 +183,4 @@ Related commands
Default
"""""""
The option defaults are error = hard, message = yes, and path = ".".
The option defaults are error = soft, message = yes, and path = ".".

View File

@ -344,12 +344,10 @@ specify additional flags for the runtime build.
----------
The *intel* style invokes settings associated with the use of the
INTEL package. All of its settings, except the *omp* and *mode*
keywords, are ignored if LAMMPS was not built with Xeon Phi
co-processor support. All of its settings, including the *omp* and
*mode* keyword are applicable if LAMMPS was built with co-processor
support.
The *intel* style invokes settings associated with the use of the INTEL
package. The keywords *balance*, *ghost*, *tpc*, and *tptask* are
**only** applicable if LAMMPS was built with Xeon Phi co-processor
support and are otherwise ignored.
The *Nphi* argument sets the number of co-processors per node.
This can be set to any value, including 0, if LAMMPS was not
@ -474,13 +472,13 @@ If the *neigh/thread* keyword is set to *off*, then the KOKKOS package
threads only over atoms. However, for small systems, this may not expose
enough parallelism to keep a GPU busy. When this keyword is set to *on*,
the KOKKOS package threads over both atoms and neighbors of atoms. When
using *neigh/thread* *on*, a full neighbor list must also be used. Using
*neigh/thread* *on* may be slower for large systems, so this this option
is turned on by default only when there are 16K atoms or less owned by
an MPI rank and when using a full neighbor list. Not all KOKKOS-enabled
potentials support this keyword yet, and only thread over atoms. Many
simple pairwise potentials such as Lennard-Jones do support threading
over both atoms and neighbors.
using *neigh/thread* *on*, the :doc:`newton pair <newton>` setting must
be "off". Using *neigh/thread* *on* may be slower for large systems, so
this this option is turned on by default only when running on one or
more GPUs and there are 16k atoms or less owned by an MPI rank. Not all
KOKKOS-enabled potentials support this keyword yet, and only thread over
atoms. Many simple pairwise potentials such as Lennard-Jones do support
threading over both atoms and neighbors.
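A hedged command-line sketch of forcing these settings explicitly (the input file name is a placeholder; the switches themselves are the documented ``-k``, ``-sf``, and ``-pk`` command-line options):

.. code-block:: console

   $ lmp -in in.melt -k on g 1 -sf kk -pk kokkos newton off neigh/thread on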
If the *neigh/transpose* keyword is set to *off*, then the KOKKOS
package will use the same memory layout for building the neighbor list on
@ -732,7 +730,7 @@ comm = device, sort = device, neigh/transpose = off, gpu/aware = on. When
LAMMPS can safely detect that GPU-aware MPI is not available, the default value
of gpu/aware becomes "off". For CPUs or Xeon Phis, the option defaults are
neigh = half, neigh/qeq = half, newton = on, binsize = 0.0, comm = no, and sort
= no. The option neigh/thread = on when there are 16K atoms or less on an MPI
= no. For GPUs, option neigh/thread = on when there are 16k atoms or less on an MPI
rank, otherwise it is "off". These settings are made automatically by the
required "-k on" :doc:`command-line switch <Run_options>`. You can change them
by using the package kokkos command in your input script or via the :doc:`-pk

View File

@ -22,13 +22,24 @@ Examples
.. code-block:: LAMMPS
pair_style hybrid/overlay aip/water/2dm 16.0 1
pair_coeff * * aip/water/2dm COH.aip.water.2dm C Ow Hw
pair_coeff * * aip/water/2dm CBNOH.aip.water.2dm C Ow Hw
pair_style hybrid/overlay aip/water/2dm 16.0 lj/cut/tip4p/long 2 3 1 1 0.1546 10 8.5
pair_coeff 2 2 lj/cut/tip4p/long 8.0313e-3 3.1589 # O-O
pair_coeff 2 3 lj/cut/tip4p/long 0.0 0.0 # O-H
pair_coeff 3 3 lj/cut/tip4p/long 0.0 0.0 # H-H
pair_coeff * * aip/water/2dm COH.aip.water.2dm C Ow Hw
pair_coeff 2 2 lj/cut/tip4p/long 8.0313e-3 3.1589 # O-O
pair_coeff 2 3 lj/cut/tip4p/long 0.0 0.0 # O-H
pair_coeff 3 3 lj/cut/tip4p/long 0.0 0.0 # H-H
pair_coeff * * aip/water/2dm CBNOH.aip.water.2dm C Ow Hw
pair_style hybrid/overlay aip/water/2dm 16.0 lj/cut/tip4p/long 3 4 1 1 0.1546 10 8.5 coul/shield 16.0 1
pair_coeff 1*2 1*2 none
pair_coeff 3 3 lj/cut/tip4p/long 8.0313e-3 3.1589 # O-O
pair_coeff 3 4 lj/cut/tip4p/long 0.0 0.0 # O-H
pair_coeff 4 4 lj/cut/tip4p/long 0.0 0.0 # H-H
pair_coeff * * aip/water/2dm CBNOH.aip.water.2dm B N Ow Hw
pair_coeff 1 3 coul/shield 1.333
pair_coeff 1 4 coul/shield 1.333
pair_coeff 2 3 coul/shield 1.333
pair_coeff 2 4 coul/shield 1.333
Description
"""""""""""
@ -37,7 +48,7 @@ Description
The *aip/water/2dm* style computes the anisotropic interfacial potential
(AIP) potential for interfaces of water with two-dimensional (2D)
materials as described in :ref:`(Feng) <Feng>`.
materials as described in :ref:`(Feng1) <Feng1>` and :ref:`(Feng2) <Feng2>`.
.. math::
@ -62,12 +73,12 @@ larger than :math:`r_c` :doc:`pair_style ilp_graphene_hbn
.. note::
This pair style uses the atomic normal vector definition from
:ref:`(Feng) <Feng>`, where the atomic normal vectors of the
:ref:`(Feng1) <Feng1>`, where the atomic normal vectors of the
hydrogen atoms are assumed to lie along the corresponding
oxygen-hydrogen bonds and the normal vector of the central oxygen
atom is defined as their average.
The provided parameter file, ``COH.aip.water.2dm``, is intended for use
The provided parameter file, ``CBNOH.aip.water.2dm``, is intended for use
with *metal* :doc:`units <units>`, with energies in meV. Two additional
parameters, *S*, and *rcut* are included in the parameter file. *S* is
designed to facilitate scaling of energies; *rcut* is the cutoff for an
@ -77,7 +88,7 @@ the calculation of the normals for all atom pairs.
.. note::
The parameters presented in the provided parameter file,
``COH.aip.water.2dm``, are fitted with the taper function enabled by
``CBNOH.aip.water.2dm``, are fitted with the taper function enabled by
setting the cutoff equal to 16.0 Angstrom. Using a different cutoff
or taper function setting should be carefully checked as they can
lead to significant errors. These parameters provide a good
@ -134,7 +145,7 @@ if LAMMPS was built with that package. See the :doc:`Build package
This pair style requires the newton setting to be *on* for pair
interactions.
The ``COH.aip.water.2dm`` potential file provided with LAMMPS is
The ``CBNOH.aip.water.2dm`` potential file provided with LAMMPS is
parameterized for *metal* units. You can use this pair style with any
LAMMPS units, but you would need to create your own potential file with
parameters in the appropriate units, if your simulation does not use
@ -162,6 +173,10 @@ tap_flag = 1
----------
.. _Feng:
.. _Feng1:
**(Feng)** Z. Feng and W. Ouyang et al., J. Phys. Chem. C. 127, 8704-8713 (2023).
**(Feng1)** Z. Feng, ..., and W. Ouyang, J. Phys. Chem. C. 127(18), 8704-8713 (2023).
.. _Feng2:
**(Feng2)** Z. Feng, ..., and W. Ouyang, Langmuir 39(50), 18198-18207 (2023).

View File

@ -22,12 +22,12 @@ Examples
.. code-block:: LAMMPS
pair_style hybrid/overlay ilp/tmd 16.0 1
pair_coeff * * ilp/tmd MoS2.ILP Mo S S
pair_coeff * * ilp/tmd TMD.ILP Mo S S
pair_style hybrid/overlay sw/mod sw/mod ilp/tmd 16.0
pair_coeff * * sw/mod 1 tmd.sw.mod Mo S S NULL NULL NULL
pair_coeff * * sw/mod 2 tmd.sw.mod NULL NULL NULL Mo S S
pair_coeff * * ilp/tmd MoS2.ILP Mo S S Mo S S
pair_coeff * * sw/mod 2 tmd.sw.mod NULL NULL NULL W Se Se
pair_coeff * * ilp/tmd TMD.ILP Mo S S W Se Se
Description
"""""""""""
@ -36,7 +36,7 @@ Description
The *ilp/tmd* style computes the registry-dependent interlayer
potential (ILP) potential for transition metal dichalcogenides (TMD)
as described in :ref:`(Ouyang7) <Ouyang7>`.
as described in :ref:`(Ouyang7) <Ouyang7>` and :ref:`(Jiang) <Jiang>`.
.. math::
@ -69,7 +69,7 @@ calculating the normals.
each atom `i`, its six nearest neighboring atoms belonging to the same
sub-layer are chosen to define the normal vector `{\bf n}_i`.
The parameter file (e.g. MoS2.ILP), is intended for use with *metal*
The parameter file (e.g. TMD.ILP) is intended for use with *metal*
:doc:`units <units>`, with energies in meV. Two additional parameters,
*S*, and *rcut* are included in the parameter file. *S* is designed to
facilitate scaling of energies. *rcut* is designed to build the neighbor
@ -77,7 +77,7 @@ list for calculating the normals for each atom pair.
.. note::
The parameters presented in the parameter file (e.g. MoS2.ILP),
The parameters presented in the parameter file (e.g. TMD.ILP),
are fitted with the taper function enabled by setting the cutoff equal
to 16.0 Angstrom. Using a different cutoff or taper function setting
should be checked carefully, as it can lead to significant errors.
These parameters provide a good description in both short- and long-range
@ -133,10 +133,10 @@ if LAMMPS was built with that package. See the :doc:`Build package
This pair style requires the newton setting to be *on* for pair
interactions.
The MoS2.ILP potential file provided with LAMMPS (see the potentials
The TMD.ILP potential file provided with LAMMPS (see the potentials
directory) is parameterized for *metal* units. You can use this
potential with any LAMMPS units, but you would need to create your own
custom MoS2.ILP potential file with coefficients listed in the appropriate
custom TMD.ILP potential file with coefficients listed in the appropriate
units, if your simulation does not use *metal* units.
Related commands
@ -164,3 +164,7 @@ tap_flag = 1
.. _Ouyang7:
**(Ouyang7)** W. Ouyang, et al., J. Chem. Theory Comput. 17, 7237 (2021).
.. _Jiang:
**(Jiang)** W. Jiang, et al., J. Phys. Chem. A, 127, 46, 9820-9830 (2023).

View File

@ -329,7 +329,8 @@ Restrictions
The *verlet/split* style can only be used if LAMMPS was built with the
REPLICA package. Correspondingly the *respa/omp* style is available
only if the OPENMP package was included. See the :doc:`Build package
<Build_package>` page for more info.
<Build_package>` page for more info. It is not compatible with
kspace styles from the INTEL package.
Whenever using rRESPA, the user should experiment with trade-offs in
speed and accuracy for their system, and verify that they are

View File

@ -792,6 +792,7 @@ dispersionflag
dissipative
Dissipative
distharm
distutils
dl
dlabel
dlambda

View File

@ -67,7 +67,7 @@ int main(int narg, char **arg)
FILE *fp;
if (me == 0) {
fp = fopen(arg[2],"r");
if (fp == NULL) {
if (fp == nullptr) {
printf("ERROR: Could not open LAMMPS input script\n");
MPI_Abort(MPI_COMM_WORLD,1);
}
@ -78,14 +78,14 @@ int main(int narg, char **arg)
// (could just send it to proc 0 of comm_lammps and let it Bcast)
// all LAMMPS procs call input->one() on the line
LAMMPS *lmp = NULL;
if (lammps == 1) lmp = new LAMMPS(0,NULL,comm_lammps);
LAMMPS *lmp = nullptr;
if (lammps == 1) lmp = new LAMMPS(0,nullptr,comm_lammps);
int n;
char line[1024];
while (1) {
while (true) {
if (me == 0) {
if (fgets(line,1024,fp) == NULL) n = 0;
if (fgets(line,1024,fp) == nullptr) n = 0;
else n = strlen(line) + 1;
if (n == 0) fclose(fp);
}
@ -101,8 +101,8 @@ int main(int narg, char **arg)
// put coords back into LAMMPS
// run a single step with changed coords
double *x = NULL;
double *v = NULL;
double *x = nullptr;
double *v = nullptr;
if (lammps == 1) {
lmp->input->one("run 10");
@ -147,7 +147,7 @@ int main(int narg, char **arg)
// create_atoms() to create new ones with old coords, vels
// initial thermo should be same as step 20
int *type = NULL;
int *type = nullptr;
if (lammps == 1) {
int natoms = static_cast<int> (lmp->atom->natoms);
@ -155,7 +155,7 @@ int main(int narg, char **arg)
for (int i = 0; i < natoms; i++) type[i] = 1;
lmp->input->one("delete_atoms group all");
lammps_create_atoms(lmp,natoms,NULL,type,x,v,NULL,0);
lammps_create_atoms(lmp,natoms,nullptr,type,x,v,nullptr,0);
lmp->input->one("run 10");
}

View File

@ -0,0 +1 @@
../../../../potentials/CBNOH.aip.water.2dm

View File

@ -1 +0,0 @@
../../../../potentials/COH.aip.water.2dm

View File

@ -1 +0,0 @@
../../../../potentials/MoS2.ILP

View File

@ -0,0 +1 @@
../../../../potentials/TMD.ILP

View File

@ -12,7 +12,7 @@ mass 4 95.94
pair_style hybrid/overlay sw/mod sw/mod ilp/tmd 16.0
pair_coeff * * sw/mod 1 tmd.sw.mod Mo S S NULL NULL NULL
pair_coeff * * sw/mod 2 tmd.sw.mod NULL NULL NULL Mo S S
pair_coeff * * ilp/tmd MoS2.ILP Mo S S Mo S S
pair_coeff * * ilp/tmd TMD.ILP Mo S S Mo S S
# Calculate the pair potential
compute 0 all pair ilp/tmd

58
potentials/CBNOH.aip.water.2dm Executable file
View File

@ -0,0 +1,58 @@
# DATE: 2023-12-20 UNITS: metal CONTRIBUTOR: Wengen Ouyang w.g.ouyang@gmail.com
# CITATION: Z. Feng, ..., and W. Ouyang, J. Phys. Chem. C 127(18), 8704 (2023).
# CITATION: Z. Feng, ..., and W. Ouyang, Langmuir 39(50), 18198-18207 (2023).
# Anisotropic Potential (AIP) for water/graphene and water/hBN heterojunctions
# The parameters below are fitted against the PBE + MBD-NL (graphene/water) and SCAN (hBN/water) DFT reference data.
# The parameters for bilayer graphene/graphene, graphene/hBN and hBN/hBN junctions are taken from
# CITATION: Ouyang, Mandelli, Urbakh, Hod, Nano Letters 18, 6009-6016 (2018).
#
# -------------------Repulsion Potential ------------------++++++++++++++ Vdw Potential ++++++++++++++++************
# beta(A) alpha delta(A) epsilon(meV) C(meV) d sR reff(A) C6(meV*A^6) S rcut
#
# For water-graphene
C Ow 5.453696 6.181724 1.250255 3.349092 0.687806 9.057065 1.232495 2.775772 100226.555031 1.0 2.0
C Hw 2.553809 9.686644 1.964892 41.776171 -16.300128 9.015685 0.744155 2.415456 7409.128564 1.0 2.0
Ow C 5.453696 6.181724 1.250255 3.349092 0.687806 9.057065 1.232495 2.775772 100226.555031 1.0 1.2
Hw C 2.553809 9.686644 1.964892 41.776171 -16.300128 9.015685 0.744155 2.415456 7409.128564 1.0 1.2
# For water-hBN
N Ow 3.530598 16.377816 1.285374 1.717537 1.339337 24.797794 0.771411 3.928357 33589.850651 1.0 2.0
N Hw 4.029390 5.360546 0.950352 15.945549 -1.486701 10.797276 1.352684 2.293775 41247.181447 1.0 2.0
B Ow 3.907514 7.842519 2.380078 32.122737 1.190485 17.482482 0.788174 2.368217 139539.370785 1.0 2.0
B Hw 3.804966 2.356248 1.114761 9.193309 -5.922514 9.000572 1.334703 1.746122 43796.489158 1.0 2.0
Ow N 3.530598 16.377816 1.285374 1.717537 1.339337 24.797794 0.771411 3.928357 33589.850651 1.0 1.2
Hw N 4.029390 5.360546 0.950352 15.945549 -1.486701 10.797276 1.352684 2.293775 41247.181447 1.0 1.2
Ow B 3.907514 7.842519 2.380078 32.122737 1.190485 17.482482 0.788174 2.368217 139539.370785 1.0 1.2
Hw B 3.804966 2.356248 1.114761 9.193309 -5.922514 9.000572 1.334703 1.746122 43796.489158 1.0 1.2
# For graphene and hydrocarbons
C C 3.205843 7.511126 1.235334 1.528338E-5 37.530428 15.499947 0.7954443 3.681440 25.714535E3 1.0 2.0
H H 3.974540 6.53799 1.080633 0.6700556 0.8333833 15.022371 0.7490632 2.767223 1.6159581E3 1.0 1.2
C H 2.642950 12.91410 1.020257 0.9750012 25.340996 15.222927 0.8115998 3.887324 5.6874617E3 1.0 1.5
H C 2.642950 12.91410 1.020257 0.9750012 25.340996 15.222927 0.8115998 3.887324 5.6874617E3 1.0 1.5
# For hBN
B B 3.143737 9.825139 1.936405 2.7848400 14.495957 15.199263 0.7834022 3.682950 49.498013E3 1.0 2.0
N N 3.443196 7.084490 1.747349 2.9139991 46.508553 15.020370 0.8008370 3.551843 14.810151E3 1.0 2.0
B N 3.295257 7.224311 2.872667 1.3715032 0.4347152 14.594578 0.8044028 3.765728 24.669996E3 1.0 2.0
B H 2.718657 9.214551 3.273063 14.015714 14.760509 15.084752 0.7768383 3.640866 7.9642467E3 1.0 1.5
N B 3.295257 7.224311 2.872667 1.3715032 0.4347152 14.594578 0.8044028 3.765728 24.669996E3 1.0 2.0
H B 2.718657 9.214551 3.273063 14.015714 14.760509 15.084752 0.7768383 3.640866 7.9642467E3 1.0 1.5
# For graphene-hBN
C B 3.303662 10.54415 2.926741 16.719972 0.3571734 15.305254 0.7001581 3.097327 30.162869E3 1.0 2.0
C N 3.253564 8.825921 1.059550 18.344740 21.913573 15.000000 0.7234983 3.013117 19.063095E3 1.0 2.0
B C 3.303662 10.54415 2.926741 16.719972 0.3571734 15.305254 0.7001581 3.097327 30.162869E3 1.0 2.0
N C 3.253564 8.825921 1.059550 18.344740 21.913573 15.000000 0.7234983 3.013117 19.063095E3 1.0 2.0
# The AIPs for other elements are turned off
H Ow 5.453696 6.181724 1.250255 0.000000 0.000000 9.057065 1.232495 2.775772 0.000000 1.0 1.2
H Hw 5.453696 6.181724 1.250255 0.000000 0.000000 9.057065 1.232495 2.775772 0.000000 1.0 1.2
Ow H 5.453696 6.181724 1.250255 0.000000 0.000000 9.057065 1.232495 2.775772 0.000000 1.0 1.2
Hw H 5.453696 6.181724 1.250255 0.000000 0.000000 9.057065 1.232495 2.775772 0.000000 1.0 1.2
Ow Ow 5.453696 6.181724 1.250255 0.000000 0.000000 9.057065 1.232495 2.775772 0.000000 1.0 1.2
Hw Hw 5.453696 6.181724 1.250255 0.000000 0.000000 9.057065 1.232495 2.775772 0.000000 1.0 1.2
Ow Hw 5.453696 6.181724 1.250255 0.000000 0.000000 9.057065 1.232495 2.775772 0.000000 1.0 1.2
Hw Ow 5.453696 6.181724 1.250255 0.000000 0.000000 9.057065 1.232495 2.775772 0.000000 1.0 1.2

View File

@ -1,28 +0,0 @@
# DATE: 2022-12-02 UNITS: metal CONTRIBUTOR: Wengen Ouyang w.g.ouyang@gmail.com CITATION: Z. Feng, ..., and W. Ouyang, J. Phys. Chem. C 127, 8704 (2023).
# Anisotropic Interfacial Potential (AIP) parameters for water/graphene heterojunctions
# The parameters below are fitted against the PBE + MBD-NL DFT reference data from 2.5 A to 15 A.
#
# ----------------- Repulsion Potential ------------------++++++++++++++ Vdw Potential ++++++++++++++++************
# beta(A) alpha delta(A) epsilon(meV) C(meV) d sR reff(A) C6(meV*A^6) S rcut
# For graphene and hydrocarbons
C C 3.205843 7.511126 1.235334 1.528338E-5 37.530428 15.499947 0.7954443 3.681440 25.714535E3 1.0 2.0
H H 3.974540 6.53799 1.080633 0.6700556 0.8333833 15.022371 0.7490632 2.767223 1.6159581E3 1.0 1.2
C H 2.642950 12.91410 1.020257 0.9750012 25.340996 15.222927 0.8115998 3.887324 5.6874617E3 1.0 1.5
H C 2.642950 12.91410 1.020257 0.9750012 25.340996 15.222927 0.8115998 3.887324 5.6874617E3 1.0 1.5
# For water-graphene
C Ow 5.45369612 6.18172364 1.25025450 3.34909245 0.68780636 9.05706482 1.23249498 2.77577173 100226.55503127 1.0 2.0
C Hw 2.55380862 9.68664390 1.96489198 41.77617053 -16.30012807 9.01568534 0.74415463 2.41545571 7409.12856378 1.0 2.0
Ow C 5.45369612 6.18172364 1.25025450 3.34909245 0.68780636 9.05706482 1.23249498 2.77577173 100226.55503127 1.0 1.2
Hw C 2.55380862 9.68664390 1.96489198 41.77617053 -16.30012807 9.01568534 0.74415463 2.41545571 7409.12856378 1.0 1.2
# # The ILPs for other systems are set to zero
H Ow 5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000 1.0 1.2
H Hw 5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000 1.0 1.2
Ow H 5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000 1.0 1.2
Hw H 5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000 1.0 1.2
Ow Ow 5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000 1.0 1.2
Hw Hw 5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000 1.0 1.2
Ow Hw 5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000 1.0 1.2
Hw Ow 5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000 1.0 1.2

25
potentials/TMD.ILP Normal file
View File

@ -0,0 +1,25 @@
# DATE: 2021-12-02 UNITS: metal CONTRIBUTOR: Wengen Ouyang w.g.ouyang@gmail.com
# CITATION: W. Ouyang, et al., J. Chem. Theory Comput. 17, 7237 (2021).
# CITATION: W. Jiang, et al., J. Phys. Chem. A, 127, 46, 9820-9830 (2023).
# Interlayer Potential (ILP) for bilayer and bulk Group-VI Transition Metal Dichalcogenides.
# The parameters below are fitted against the HSE + MBD-NL DFT reference data.
#
# -------------------- Repulsion Potential -------------------++++++++++++++++ Vdw Potential ++++++++++++++++*********
# beta(A) alpha delta(A) epsilon(meV) C(meV) d sR reff(A) C6(meV*A^6) S rcut
Mo Mo 5.579450 9.377662 2.027222 144.151775 97.978570 89.437597 2.059031 5.122055 491850.316195 1.0 4.0
W W 5.530854 6.624992 1.983208 0.271792 140.174059 107.392585 1.356333 4.437591 691850.243962 1.0 4.0
S S 3.161402 8.093263 1.953140 4.586764 118.065466 58.809416 0.215367 4.299600 148811.243409 1.0 4.0
Se Se 3.938627 10.515924 2.415783 3.012583 22.400612 116.864517 0.151121 5.884241 112506.195626 1.0 4.0
Mo W 5.412298 8.647128 2.108665 51.177950 184.342860 201.281256 2.547743 2.492287 99996.913401 1.0 4.0
Mo S 3.627152 19.971375 7.585031 76.101931 3.317496 45.720328 0.947470 4.410425 150597.857716 1.0 4.0
Mo Se 6.196447 4.844134 14.362005 7.407221 0.058823 27.156223 0.976771 3.979186 786029.840651 1.0 4.0
W S 3.680136 11.163004 32.254117 110.019679 79.381335 138.340438 0.900750 8.875776 250600.809034 1.0 4.0
W Se 3.559392 20.638856 1.202717 20.478669 197.422484 10.005271 1.052738 3.815817 288321.561114 1.0 4.0
S Se 2.820092 7.491151 1.933323 141.532559 293.127817 90.470904 0.390492 4.170885 117688.987069 1.0 4.0
# Symmetric Atom Pair
W Mo 5.412298 8.647128 2.108665 51.177950 184.342860 201.281256 2.547743 2.492287 99996.913401 1.0 4.0
S Mo 3.627152 19.971375 7.585031 76.101931 3.317496 45.720328 0.947470 4.410425 150597.857716 1.0 4.0
Se Mo 6.196447 4.844134 14.362005 7.407221 0.058823 27.156223 0.976771 3.979186 786029.840651 1.0 4.0
S W 3.680136 11.163004 32.254117 110.019679 79.381335 138.340438 0.900750 8.875776 250600.809034 1.0 4.0
Se W 3.559392 20.638856 1.202717 20.478669 197.422484 10.005271 1.052738 3.815817 288321.561114 1.0 4.0
Se S 2.820092 7.491151 1.933323 141.532559 293.127817 90.470904 0.390492 4.170885 117688.987069 1.0 4.0

View File

@ -1076,7 +1076,6 @@ void FixTGNHDrude::couple()
void FixTGNHDrude::remap()
{
int i;
double oldlo,oldhi;
double expfac;

View File

@ -305,84 +305,84 @@ void ComputeSLCSAAtom::compute_peratom()
int *mask = atom->mask;
int nlocal = atom->nlocal;
double **compute_array;
if (descriptorval.which == ArgInfo::COMPUTE) {
if (!(descriptorval.val.c->invoked_flag & Compute::INVOKED_PERATOM)) {
descriptorval.val.c->compute_peratom();
descriptorval.val.c->invoked_flag |= Compute::INVOKED_PERATOM;
}
compute_array = descriptorval.val.c->array_atom;
}
double **compute_array = descriptorval.val.c->array_atom;
memory->create(full_descriptor, ncomps, "slcsa/atom:local descriptor");
memory->create(projected_descriptor, nclasses - 1, "slcsa/atom:reduced descriptor");
memory->create(scores, nclasses, "slcsa/atom:scores");
memory->create(probas, nclasses, "slcsa/atom:probas");
memory->create(prodright, nclasses - 1, "slcsa/atom:prodright");
memory->create(dmaha, nclasses, "slcsa/atom:prodright");
memory->create(full_descriptor, ncomps, "slcsa/atom:local descriptor");
memory->create(projected_descriptor, nclasses - 1, "slcsa/atom:reduced descriptor");
memory->create(scores, nclasses, "slcsa/atom:scores");
memory->create(probas, nclasses, "slcsa/atom:probas");
memory->create(prodright, nclasses - 1, "slcsa/atom:prodright");
memory->create(dmaha, nclasses, "slcsa/atom:prodright");
for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) {
for (int j = 0; j < ncomps; j++) { full_descriptor[j] = compute_array[i][j]; }
// Here comes the LDA + LR process
// 1st step : Retrieve mean database descriptor
for (int j = 0; j < ncomps; j++) { full_descriptor[j] -= database_mean_descriptor[j]; }
// 2nd step : Matrix multiplication to go from ncompsx1 -> (nclasses-1)*1
for (int j = 0; j < nclasses - 1; j++) {
projected_descriptor[j] = 0.;
for (int k = 0; k < ncomps; k++) {
projected_descriptor[j] += full_descriptor[k] * lda_scalings[k][j];
}
}
// 3rd step : Matrix multiplication
for (int j = 0; j < nclasses; j++) {
scores[j] = lr_bias[j];
for (int k = 0; k < nclasses - 1; k++) {
scores[j] += lr_decision[j][k] * projected_descriptor[k];
}
}
// 4th step : Matrix multiplication
double sumexpscores = 0.;
for (int j = 0; j < nclasses; j++) sumexpscores += exp(scores[j]);
for (int j = 0; j < nclasses; j++) { probas[j] = exp(scores[j]) / sumexpscores; }
classification[i][nclasses] = argmax(probas, nclasses);
// 5th step : Mahalanobis distance
for (int j = 0; j < nclasses; j++) {
prodright[0] = 0.;
prodright[1] = 0.;
prodright[2] = 0.;
for (int k = 0; k < nclasses - 1; k++) {
for (int l = 0; l < nclasses - 1; l++) {
prodright[k] +=
(icov_list[j][k][l] * (projected_descriptor[k] - mean_projected_descriptors[j][k]));
for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) {
for (int j = 0; j < ncomps; j++) full_descriptor[j] = compute_array[i][j];
// Here comes the LDA + LR process
// 1st step : Retrieve mean database descriptor
for (int j = 0; j < ncomps; j++) full_descriptor[j] -= database_mean_descriptor[j];
// 2nd step : Matrix multiplication to go from ncompsx1 -> (nclasses-1)*1
for (int j = 0; j < nclasses - 1; j++) {
projected_descriptor[j] = 0.0;
for (int k = 0; k < ncomps; k++) {
projected_descriptor[j] += full_descriptor[k] * lda_scalings[k][j];
}
}
double prodleft = 0.;
for (int k = 0; k < nclasses - 1; k++) {
prodleft += (prodright[k] * (projected_descriptor[k] - mean_projected_descriptors[j][k]));
// 3rd step : Matrix multiplication
for (int j = 0; j < nclasses; j++) {
scores[j] = lr_bias[j];
for (int k = 0; k < nclasses - 1; k++) {
scores[j] += lr_decision[j][k] * projected_descriptor[k];
}
}
classification[i][j] = sqrt(prodleft);
}
// 6th step : Sanity check
int locclass = classification[i][nclasses];
// 4th step : Matrix multiplication
double sumexpscores = 0.0;
for (int j = 0; j < nclasses; j++) sumexpscores += exp(scores[j]);
for (int j = 0; j < nclasses; j++) probas[j] = exp(scores[j]) / sumexpscores;
if (classification[i][locclass] > maha_thresholds[locclass]) {
classification[i][nclasses] = -1.;
}
classification[i][nclasses] = argmax(probas, nclasses);
} else {
for (int j = 0; j < ncols; j++) { classification[i][j] = -1.; }
// 5th step : Mahalanobis distance
for (int j = 0; j < nclasses; j++) {
prodright[0] = 0.0;
prodright[1] = 0.0;
prodright[2] = 0.0;
for (int k = 0; k < nclasses - 1; k++) {
for (int l = 0; l < nclasses - 1; l++) {
prodright[k] += (icov_list[j][k][l] *
(projected_descriptor[k] - mean_projected_descriptors[j][k]));
}
}
double prodleft = 0.0;
for (int k = 0; k < nclasses - 1; k++) {
prodleft +=
(prodright[k] * (projected_descriptor[k] - mean_projected_descriptors[j][k]));
}
classification[i][j] = sqrt(prodleft);
}
// 6th step : Sanity check
int locclass = classification[i][nclasses];
if (classification[i][locclass] > maha_thresholds[locclass]) {
classification[i][nclasses] = -1.0;
}
} else {
for (int j = 0; j < ncols; j++) classification[i][j] = -1.0;
}
}
memory->destroy(full_descriptor);
memory->destroy(projected_descriptor);
memory->destroy(scores);
memory->destroy(probas);
memory->destroy(prodright);
memory->destroy(dmaha);
}
memory->destroy(full_descriptor);
memory->destroy(projected_descriptor);
memory->destroy(scores);
memory->destroy(probas);
memory->destroy(prodright);
memory->destroy(dmaha);
}
int ComputeSLCSAAtom::compute_ncomps(int twojmax)

View File

@ -71,6 +71,7 @@ FixNonaffineDisplacement::FixNonaffineDisplacement(LAMMPS *lmp, int narg, char *
nevery = utils::inumeric(FLERR, arg[3], false, lmp);
if (nevery <= 0) error->all(FLERR,"Illegal nevery value {} in fix nonaffine/displacement", nevery);
reference_timestep = update_timestep = offset_timestep = -1;
int iarg = 4;
if (strcmp(arg[iarg], "integrated") == 0) {
nad_style = INTEGRATED;
@ -99,7 +100,7 @@ FixNonaffineDisplacement::FixNonaffineDisplacement(LAMMPS *lmp, int narg, char *
if (strcmp(arg[iarg], "fixed") == 0) {
reference_style = FIXED;
reference_timestep = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
if (update_timestep < 0)
if (reference_timestep < 0)
error->all(FLERR, "Illegal reference timestep {} in fix nonaffine/displacement", arg[iarg + 1]);
} else if (strcmp(arg[iarg], "update") == 0) {
reference_style = UPDATE;

View File

@ -32,7 +32,7 @@ class FixNonaffineDisplacement : public Fix {
void post_constructor() override;
void init() override;
void init_list(int, class NeighList *) override;
void setup(int);
void setup(int) override;
void post_force(int) override;
void write_restart(FILE *fp) override;
void restart(char *buf) override;
@ -62,7 +62,7 @@ class FixNonaffineDisplacement : public Fix {
void calculate_D2Min();
void save_reference_state();
void minimum_image0(double *);
void grow_arrays(int);
void grow_arrays(int) override;
};
} // namespace LAMMPS_NS

View File

@ -553,6 +553,9 @@ void FixIntel::kspace_init_check()
if (intel_pair == 0)
error->all(FLERR,"Intel styles for kspace require intel pair style.");
if (utils::strmatch(update->integrate_style, "^verlet/split"))
error->all(FLERR,"Intel styles for kspace are not compatible with run_style verlet/split");
}
/* ---------------------------------------------------------------------- */

View File

@ -210,7 +210,7 @@ void PairILPTMD::calc_FRep(int eflag, int /* vflag */)
delki[1] = x[k][1] - x[i][1];
delki[2] = x[k][2] - x[i][2];
if (evflag)
ev_tally_xyz(k, j, nlocal, newton_pair, 0.0, 0.0, fk[0], fk[1], fk[2], delki[0],
ev_tally_xyz(k, i, nlocal, newton_pair, 0.0, 0.0, fk[0], fk[1], fk[2], delki[0],
delki[1], delki[2]);
}

View File

@ -590,7 +590,7 @@ void PairKolmogorovCrespiFull::calc_FRep(int eflag, int /* vflag */)
delki[1] = x[k][1] - x[i][1];
delki[2] = x[k][2] - x[i][2];
if (evflag)
ev_tally_xyz(k, j, nlocal, newton_pair, 0.0, 0.0, fk[0], fk[1], fk[2], delki[0],
ev_tally_xyz(k, i, nlocal, newton_pair, 0.0, 0.0, fk[0], fk[1], fk[2], delki[0],
delki[1], delki[2]);
}

View File

@ -25,6 +25,7 @@
#include "kokkos_base.h"
#include "modify.h"
#include "fix.h"
#include "fix_property_atom_kokkos.h"
using namespace LAMMPS_NS;

View File

@ -14,7 +14,6 @@
#include "atom.h" // IWYU pragma: export
#include "kokkos_type.h"
#include "fix_property_atom_kokkos.h"
#include <Kokkos_Sort.hpp>
@ -27,7 +26,7 @@ class AtomKokkos : public Atom {
public:
bool sort_classic;
int nprop_atom;
FixPropertyAtomKokkos** fix_prop_atom;
class FixPropertyAtomKokkos **fix_prop_atom;
DAT::tdual_tagint_1d k_tag;
DAT::tdual_int_1d k_type, k_mask;

View File

@ -640,7 +640,7 @@ void Grid3dKokkos<DeviceType>::forward_comm(int caller, void *ptr, int which, in
MPI_Datatype datatype)
{
if (caller == KSPACE) {
if (layout != Comm::LAYOUT_TILED)
if (comm->layout != Comm::LAYOUT_TILED)
forward_comm_kspace_brick((KSpace *) ptr,which,nper,k_buf1,k_buf2,datatype);
else
forward_comm_kspace_tiled((KSpace *) ptr,which,nper,k_buf1,k_buf2,datatype);
@ -780,7 +780,7 @@ void Grid3dKokkos<DeviceType>::reverse_comm(int caller, void *ptr, int which, in
MPI_Datatype datatype)
{
if (caller == KSPACE) {
if (layout != Comm::LAYOUT_TILED)
if (comm->layout != Comm::LAYOUT_TILED)
reverse_comm_kspace_brick((KSpace *) ptr,which,nper,k_buf1,k_buf2,datatype);
else
reverse_comm_kspace_tiled((KSpace *) ptr,which,nper,k_buf1,k_buf2,datatype);

View File

@ -608,8 +608,8 @@ void KokkosLMP::accelerator(int narg, char **arg)
force->newton = force->newton_pair = force->newton_bond = newtonflag;
if (neigh_thread && neighflag != FULL)
error->all(FLERR,"Must use KOKKOS package option 'neigh full' with 'neigh/thread on'");
if (neigh_thread && newtonflag)
error->all(FLERR,"Must use KOKKOS package option 'newton off' with 'neigh/thread on'");
neighbor->binsize_user = binsize;
if (binsize <= 0.0) neighbor->binsizeflag = 0;

View File

@ -21,6 +21,7 @@
#include "angle.h"
#include "atom_kokkos.h"
#include "atom_masks.h"
#include "atom_vec.h"
#include "bond.h"
#include "comm.h"
#include "compute.h"

View File

@ -19,10 +19,12 @@
#ifndef LMP_PAIR_KOKKOS_H
#define LMP_PAIR_KOKKOS_H
#include "Kokkos_Macros.hpp"
#include "pair.h" // IWYU pragma: export
#include "neighbor_kokkos.h"
#include "neigh_list_kokkos.h"
#include "math_special.h"
#include "update.h"
#include "Kokkos_Macros.hpp"
#include "Kokkos_ScatterView.hpp"
namespace LAMMPS_NS {
@ -63,6 +65,7 @@ struct PairComputeFunctor {
typename AT::t_f_array f;
typename AT::t_efloat_1d d_eatom;
typename AT::t_virial_array d_vatom;
int inum;
using KKDeviceType = typename KKDevice<device_type>::value;
using DUP = NeedDup_v<NEIGHFLAG,device_type>;
@ -81,8 +84,6 @@ struct PairComputeFunctor {
// typename KKDevice<device_type>::value,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > vatom;
KKScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,KKDeviceType,KKScatterSum,DUP> dup_vatom;
NeighListKokkos<device_type> list;
PairComputeFunctor(PairStyle* c_ptr,
@ -95,6 +96,7 @@ struct PairComputeFunctor {
dup_f = Kokkos::Experimental::create_scatter_view<KKScatterSum, DUP>(c.f);
dup_eatom = Kokkos::Experimental::create_scatter_view<KKScatterSum, DUP>(c.d_eatom);
dup_vatom = Kokkos::Experimental::create_scatter_view<KKScatterSum, DUP>(c.d_vatom);
inum = list.inum;
};
// Set copymode = 1 so parent allocations aren't destructed by copies of the style
@ -105,17 +107,22 @@ struct PairComputeFunctor {
}
void contribute() {
Kokkos::Experimental::contribute(c.f, dup_f);
int need_dup = std::is_same_v<DUP,Kokkos::Experimental::ScatterDuplicated>;
if (c.eflag_atom)
Kokkos::Experimental::contribute(c.d_eatom, dup_eatom);
if (need_dup) {
Kokkos::Experimental::contribute(c.f, dup_f);
if (c.vflag_atom)
Kokkos::Experimental::contribute(c.d_vatom, dup_vatom);
if (c.eflag_atom)
Kokkos::Experimental::contribute(c.d_eatom, dup_eatom);
if (c.vflag_atom)
Kokkos::Experimental::contribute(c.d_vatom, dup_vatom);
}
}
// Loop over neighbors of one atom without coulomb interaction
// This function is called in parallel
template<int EVFLAG, int NEWTON_PAIR>
KOKKOS_FUNCTION
EV_FLOAT compute_item(const int& ii,
@ -161,7 +168,7 @@ struct PairComputeFunctor {
fytmp += dely*fpair;
fztmp += delz*fpair;
if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) {
if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) {
a_f(j,0) -= delx*fpair;
a_f(j,1) -= dely*fpair;
a_f(j,2) -= delz*fpair;
@ -169,9 +176,9 @@ struct PairComputeFunctor {
if (EVFLAG) {
F_FLOAT evdwl = 0.0;
if (c.eflag) {
if (c.eflag_either) {
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)*evdwl;
ev.evdwl += (((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)*evdwl;
}
if (c.vflag_either || c.eflag_atom) ev_tally(ev,i,j,evdwl,fpair,delx,dely,delz);
@ -189,6 +196,7 @@ struct PairComputeFunctor {
// Loop over neighbors of one atom with coulomb interaction
// This function is called in parallel
template<int EVFLAG, int NEWTON_PAIR>
KOKKOS_FUNCTION
EV_FLOAT compute_item(const int& ii,
@ -241,7 +249,7 @@ struct PairComputeFunctor {
fytmp += dely*fpair;
fztmp += delz*fpair;
if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) {
if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) {
a_f(j,0) -= delx*fpair;
a_f(j,1) -= dely*fpair;
a_f(j,2) -= delz*fpair;
@ -250,14 +258,14 @@ struct PairComputeFunctor {
if (EVFLAG) {
F_FLOAT evdwl = 0.0;
F_FLOAT ecoul = 0.0;
if (c.eflag) {
if (c.eflag_either) {
if (rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) {
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)*evdwl;
ev.evdwl += (((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || (j < c.nlocal)))?1.0:0.5)*evdwl;
}
if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) {
ecoul = c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
ev.ecoul += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)*ecoul;
ev.ecoul += (((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || (j < c.nlocal)))?1.0:0.5)*ecoul;
}
}
@ -273,14 +281,16 @@ struct PairComputeFunctor {
return ev;
}
// Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial
// TeamPolicy, newton off, and no energy/virial
// Loop over neighbors of one atom without coulomb interaction
// This function is called in parallel
KOKKOS_FUNCTION
void compute_item_team(typename Kokkos::TeamPolicy<device_type>::member_type team,
const NeighListKokkos<device_type> &list, const NoCoulTag&) const {
const int inum = team.league_size();
auto a_f = dup_f.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
const int atoms_per_team = team.team_size();
const int firstatom = team.league_rank()*atoms_per_team;
const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
@ -292,7 +302,7 @@ struct PairComputeFunctor {
const X_FLOAT ztmp = c.x(i,2);
const int itype = c.type(i);
if (ZEROFLAG) {
if (NEIGHFLAG == FULL && ZEROFLAG) {
Kokkos::single(Kokkos::PerThread(team), [&] (){
f(i,0) = 0.0;
f(i,1) = 0.0;
@ -321,30 +331,42 @@ struct PairComputeFunctor {
const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
ftmp.x += delx*fpair;
ftmp.y += dely*fpair;
ftmp.z += delz*fpair;
const F_FLOAT fx = delx*fpair;
const F_FLOAT fy = dely*fpair;
const F_FLOAT fz = delz*fpair;
ftmp.x += fx;
ftmp.y += fy;
ftmp.z += fz;
if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) {
a_f(j,0) -= fx;
a_f(j,1) -= fy;
a_f(j,2) -= fz;
}
}
},fsum);
Kokkos::single(Kokkos::PerThread(team), [&] () {
f(i,0) += fsum.x;
f(i,1) += fsum.y;
f(i,2) += fsum.z;
a_f(i,0) += fsum.x;
a_f(i,1) += fsum.y;
a_f(i,2) += fsum.z;
});
});
}
// Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial
// TeamPolicy, newton off, and no energy/virial
// Loop over neighbors of one atom with coulomb interaction
// This function is called in parallel
KOKKOS_FUNCTION
void compute_item_team(typename Kokkos::TeamPolicy<device_type>::member_type team,
const NeighListKokkos<device_type> &list, const CoulTag& ) const {
const int inum = team.league_size();
auto a_f = dup_f.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
const int atoms_per_team = team.team_size();
int firstatom = team.league_rank()*atoms_per_team;
int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
@ -357,8 +379,9 @@ struct PairComputeFunctor {
const int itype = c.type(i);
const F_FLOAT qtmp = c.q(i);
if (ZEROFLAG) {
Kokkos::single(Kokkos::PerThread(team), [&] (){
if (NEIGHFLAG == FULL && ZEROFLAG) {
Kokkos::single(Kokkos::PerThread(team), [&] ()
{
f(i,0) = 0.0;
f(i,1) = 0.0;
f(i,2) = 0.0;
@ -391,31 +414,45 @@ struct PairComputeFunctor {
if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype)))
fpair+=c.template compute_fcoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
ftmp.x += delx*fpair;
ftmp.y += dely*fpair;
ftmp.z += delz*fpair;
const F_FLOAT fx = delx*fpair;
const F_FLOAT fy = dely*fpair;
const F_FLOAT fz = delz*fpair;
ftmp.x += fx;
ftmp.y += fy;
ftmp.z += fz;
if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) {
a_f(j,0) -= fx;
a_f(j,1) -= fy;
a_f(j,2) -= fz;
}
}
},fsum);
Kokkos::single(Kokkos::PerThread(team), [&] () {
f(i,0) += fsum.x;
f(i,1) += fsum.y;
f(i,2) += fsum.z;
a_f(i,0) += fsum.x;
a_f(i,1) += fsum.y;
a_f(i,2) += fsum.z;
});
});
}
// Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial
// TeamPolicy, newton off, and energy/virial
// Loop over neighbors of one atom without coulomb interaction
// This function is called in parallel
KOKKOS_FUNCTION
EV_FLOAT compute_item_team_ev(typename Kokkos::TeamPolicy<device_type>::member_type team,
const NeighListKokkos<device_type> &list, const NoCoulTag&) const {
auto a_f = dup_f.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
auto a_eatom = dup_eatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
auto a_vatom = dup_vatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
EV_FLOAT ev;
const int inum = team.league_size();
const int atoms_per_team = team.team_size();
const int firstatom = team.league_rank()*atoms_per_team;
const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
@ -427,8 +464,9 @@ struct PairComputeFunctor {
const X_FLOAT ztmp = c.x(i,2);
const int itype = c.type(i);
if (ZEROFLAG) {
Kokkos::single(Kokkos::PerThread(team), [&] (){
if (NEIGHFLAG == FULL && ZEROFLAG) {
Kokkos::single(Kokkos::PerThread(team), [&] ()
{
f(i,0) = 0.0;
f(i,1) = 0.0;
f(i,2) = 0.0;
@ -456,37 +494,85 @@ struct PairComputeFunctor {
const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
fev_tmp.f[0] += delx*fpair;
fev_tmp.f[1] += dely*fpair;
fev_tmp.f[2] += delz*fpair;
const F_FLOAT fx = delx*fpair;
const F_FLOAT fy = dely*fpair;
const F_FLOAT fz = delz*fpair;
fev_tmp.f[0] += fx;
fev_tmp.f[1] += fy;
fev_tmp.f[2] += fz;
const int I_CONTRIB = (NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD);
const int J_CONTRIB = ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal);
const E_FLOAT factor = J_CONTRIB?1.0:0.5;
if (J_CONTRIB) {
a_f(j,0) -= fx;
a_f(j,1) -= fy;
a_f(j,2) -= fz;
}
F_FLOAT evdwl = 0.0;
if (c.eflag) {
if (c.eflag_either) {
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
fev_tmp.evdwl += 0.5*evdwl;
fev_tmp.evdwl += factor * evdwl;
if (c.eflag_atom) {
const E_FLOAT epairhalf = 0.5 * evdwl;
if (I_CONTRIB)
a_eatom[i] += epairhalf;
if (J_CONTRIB)
a_eatom[j] += epairhalf;
}
}
if (c.vflag_either) {
fev_tmp.v[0] += 0.5*delx*delx*fpair;
fev_tmp.v[1] += 0.5*dely*dely*fpair;
fev_tmp.v[2] += 0.5*delz*delz*fpair;
fev_tmp.v[3] += 0.5*delx*dely*fpair;
fev_tmp.v[4] += 0.5*delx*delz*fpair;
fev_tmp.v[5] += 0.5*dely*delz*fpair;
const E_FLOAT v0 = delx*delx*fpair;
const E_FLOAT v1 = dely*dely*fpair;
const E_FLOAT v2 = delz*delz*fpair;
const E_FLOAT v3 = delx*dely*fpair;
const E_FLOAT v4 = delx*delz*fpair;
const E_FLOAT v5 = dely*delz*fpair;
fev_tmp.v[0] += factor*v0;
fev_tmp.v[1] += factor*v1;
fev_tmp.v[2] += factor*v2;
fev_tmp.v[3] += factor*v3;
fev_tmp.v[4] += factor*v4;
fev_tmp.v[5] += factor*v5;
if (c.vflag_atom) {
if (I_CONTRIB) {
a_vatom(i,0) += 0.5*v0;
a_vatom(i,1) += 0.5*v1;
a_vatom(i,2) += 0.5*v2;
a_vatom(i,3) += 0.5*v3;
a_vatom(i,4) += 0.5*v4;
a_vatom(i,5) += 0.5*v5;
}
if (J_CONTRIB) {
a_vatom(j,0) += 0.5*v0;
a_vatom(j,1) += 0.5*v1;
a_vatom(j,2) += 0.5*v2;
a_vatom(j,3) += 0.5*v3;
a_vatom(j,4) += 0.5*v4;
a_vatom(j,5) += 0.5*v5;
}
}
}
}
},fev);
Kokkos::single(Kokkos::PerThread(team), [&] () {
f(i,0) += fev.f[0];
f(i,1) += fev.f[1];
f(i,2) += fev.f[2];
a_f(i,0) += fev.f[0];
a_f(i,1) += fev.f[1];
a_f(i,2) += fev.f[2];
if (c.eflag_global)
ev.evdwl += fev.evdwl;
if (c.eflag_atom)
d_eatom(i) += fev.evdwl;
if (c.vflag_global) {
ev.v[0] += fev.v[0];
ev.v[1] += fev.v[1];
@ -496,29 +582,39 @@ struct PairComputeFunctor {
ev.v[5] += fev.v[5];
}
if (c.vflag_atom) {
d_vatom(i,0) += fev.v[0];
d_vatom(i,1) += fev.v[1];
d_vatom(i,2) += fev.v[2];
d_vatom(i,3) += fev.v[3];
d_vatom(i,4) += fev.v[4];
d_vatom(i,5) += fev.v[5];
if (NEIGHFLAG == FULL) {
if (c.eflag_atom)
a_eatom(i) += fev.evdwl;
if (c.vflag_atom) {
a_vatom(i,0) += fev.v[0];
a_vatom(i,1) += fev.v[1];
a_vatom(i,2) += fev.v[2];
a_vatom(i,3) += fev.v[3];
a_vatom(i,4) += fev.v[4];
a_vatom(i,5) += fev.v[5];
}
}
});
});
return ev;
}
// Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial
// TeamPolicy, newton off, and energy/virial
// Loop over neighbors of one atom with coulomb interaction
// This function is called in parallel
KOKKOS_FUNCTION
EV_FLOAT compute_item_team_ev(typename Kokkos::TeamPolicy<device_type>::member_type team,
const NeighListKokkos<device_type> &list, const CoulTag& ) const {
auto a_f = dup_f.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
auto a_eatom = dup_eatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
auto a_vatom = dup_vatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
EV_FLOAT ev;
const int inum = team.league_size();
const int atoms_per_team = team.team_size();
const int firstatom = team.league_rank()*atoms_per_team;
const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
@ -566,45 +662,92 @@ struct PairComputeFunctor {
if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype)))
fpair+=c.template compute_fcoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
fev_tmp.f[0] += delx*fpair;
fev_tmp.f[1] += dely*fpair;
fev_tmp.f[2] += delz*fpair;
const F_FLOAT fx = delx*fpair;
const F_FLOAT fy = dely*fpair;
const F_FLOAT fz = delz*fpair;
fev_tmp.f[0] += fx;
fev_tmp.f[1] += fy;
fev_tmp.f[2] += fz;
const int I_CONTRIB = (NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD);
const int J_CONTRIB = ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal);
const E_FLOAT factor = J_CONTRIB?1.0:0.5;
if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) {
a_f(j,0) -= fx;
a_f(j,1) -= fy;
a_f(j,2) -= fz;
}
F_FLOAT evdwl = 0.0;
F_FLOAT ecoul = 0.0;
if (c.eflag) {
if (c.eflag_either) {
if (rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) {
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
fev_tmp.evdwl += 0.5*evdwl;
fev_tmp.evdwl += factor * evdwl;
}
if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) {
ecoul = c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
fev_tmp.ecoul += 0.5*ecoul;
fev_tmp.ecoul += factor * ecoul;
}
if (c.eflag_atom) {
const E_FLOAT epairhalf = 0.5 * (evdwl + ecoul);
if (I_CONTRIB)
a_eatom[i] += epairhalf;
if (J_CONTRIB)
a_eatom[j] += epairhalf;
}
}
if (c.vflag_either) {
fev_tmp.v[0] += 0.5*delx*delx*fpair;
fev_tmp.v[1] += 0.5*dely*dely*fpair;
fev_tmp.v[2] += 0.5*delz*delz*fpair;
fev_tmp.v[3] += 0.5*delx*dely*fpair;
fev_tmp.v[4] += 0.5*delx*delz*fpair;
fev_tmp.v[5] += 0.5*dely*delz*fpair;
const E_FLOAT v0 = delx*delx*fpair;
const E_FLOAT v1 = dely*dely*fpair;
const E_FLOAT v2 = delz*delz*fpair;
const E_FLOAT v3 = delx*dely*fpair;
const E_FLOAT v4 = delx*delz*fpair;
const E_FLOAT v5 = dely*delz*fpair;
fev_tmp.v[0] += factor*v0;
fev_tmp.v[1] += factor*v1;
fev_tmp.v[2] += factor*v2;
fev_tmp.v[3] += factor*v3;
fev_tmp.v[4] += factor*v4;
fev_tmp.v[5] += factor*v5;
if (c.vflag_atom) {
if (I_CONTRIB) {
a_vatom(i,0) += 0.5*v0;
a_vatom(i,1) += 0.5*v1;
a_vatom(i,2) += 0.5*v2;
a_vatom(i,3) += 0.5*v3;
a_vatom(i,4) += 0.5*v4;
a_vatom(i,5) += 0.5*v5;
}
if (J_CONTRIB) {
a_vatom(j,0) += 0.5*v0;
a_vatom(j,1) += 0.5*v1;
a_vatom(j,2) += 0.5*v2;
a_vatom(j,3) += 0.5*v3;
a_vatom(j,4) += 0.5*v4;
a_vatom(j,5) += 0.5*v5;
}
}
}
}
},fev);
Kokkos::single(Kokkos::PerThread(team), [&] () {
f(i,0) += fev.f[0];
f(i,1) += fev.f[1];
f(i,2) += fev.f[2];
a_f(i,0) += fev.f[0];
a_f(i,1) += fev.f[1];
a_f(i,2) += fev.f[2];
if (c.eflag_global) {
if (c.eflag_global)
ev.evdwl += fev.evdwl;
ev.ecoul += fev.ecoul;
}
if (c.eflag_atom)
d_eatom(i) += fev.evdwl + fev.ecoul;
if (c.vflag_global) {
ev.v[0] += fev.v[0];
@ -615,13 +758,19 @@ struct PairComputeFunctor {
ev.v[5] += fev.v[5];
}
if (c.vflag_atom) {
d_vatom(i,0) += fev.v[0];
d_vatom(i,1) += fev.v[1];
d_vatom(i,2) += fev.v[2];
d_vatom(i,3) += fev.v[3];
d_vatom(i,4) += fev.v[4];
d_vatom(i,5) += fev.v[5];
if (NEIGHFLAG == FULL) {
if (c.eflag_atom)
a_eatom(i) += fev.evdwl;
if (c.vflag_atom) {
a_vatom(i,0) += fev.v[0];
a_vatom(i,1) += fev.v[1];
a_vatom(i,2) += fev.v[2];
a_vatom(i,3) += fev.v[3];
a_vatom(i,4) += fev.v[4];
a_vatom(i,5) += fev.v[5];
}
}
});
});
@ -636,7 +785,7 @@ struct PairComputeFunctor {
auto a_eatom = dup_eatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
auto a_vatom = dup_vatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
const int EFLAG = c.eflag;
const int EFLAG = c.eflag_either;
const int NEWTON_PAIR = c.newton_pair;
const int VFLAG = c.vflag_either;
@ -657,7 +806,7 @@ struct PairComputeFunctor {
const E_FLOAT v5 = dely*delz*fpair;
if (c.vflag_global) {
if (NEIGHFLAG!=FULL) {
if (NEIGHFLAG != FULL) {
if (NEWTON_PAIR) {
ev.v[0] += v0;
ev.v[1] += v1;
@ -747,7 +896,8 @@ struct PairComputeFunctor {
// This uses the fact that failure to match template parameters is not an error.
// By having the enable_if with a ! and without it, exactly one of the functions
// pair_compute_neighlist will match - either the dummy version
// or the real one further below.
// or the real one further below
template<class PairStyle, unsigned NEIGHFLAG, int ZEROFLAG = 0, class Specialisation = void>
EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<!((NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0), NeighListKokkos<typename PairStyle::device_type>*> list) {
EV_FLOAT ev;
@ -757,24 +907,29 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<!((NEIGHFLAG
return ev;
}
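The comment above describes a SFINAE-based dispatch: two pair_compute_neighlist overloads are guarded by std::enable_if_t with complementary conditions, so for a given NEIGHFLAG exactly one of them takes part in overload resolution and the other silently drops out. A minimal standalone sketch of that pattern, not part of this commit (StyleA, compute and the flag values are made up for illustration):
// enable_if_dispatch_sketch.cpp - standalone sketch of the SFINAE trick described above
#include <cstdio>
#include <type_traits>
struct StyleA { static constexpr unsigned EnabledNeighFlags = 0x1; };  // supports flag 0x1 only
// dummy version: selected when the style does NOT support the requested flag
template <class Style, unsigned FLAG>
std::enable_if_t<(FLAG & Style::EnabledNeighFlags) == 0, int> compute()
{
  std::puts("flag not supported: dummy overload (would error out at run time)");
  return -1;
}
// real version: selected when the style does support the requested flag
template <class Style, unsigned FLAG>
std::enable_if_t<(FLAG & Style::EnabledNeighFlags) != 0, int> compute()
{
  std::puts("flag supported: real overload");
  return 0;
}
int main()
{
  compute<StyleA, 0x1>();   // picks the real overload
  compute<StyleA, 0x2>();   // picks the dummy overload
  return 0;
}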
template<class NeighStyle>
int GetMaxNeighs(NeighStyle* list)
{
auto d_ilist = list->d_ilist;
auto d_numneigh = list->d_numneigh;
int inum = list->inum;
int maxneigh = 0;
Kokkos::parallel_reduce(inum, LAMMPS_LAMBDA(const int ii, int &maxneigh) {
const int i = d_ilist[ii];
const int num_neighs = d_numneigh[i];
maxneigh = MAX(maxneigh,num_neighs);
}, Kokkos::Max<int>(maxneigh));
return maxneigh;
}
template<class DeviceType, class FunctorStyle>
int GetTeamSize(FunctorStyle& KOKKOS_GPU_ARG(functor), int KOKKOS_GPU_ARG(inum),
int KOKKOS_GPU_ARG(reduce_flag), int team_size, int KOKKOS_GPU_ARG(vector_length)) {
#ifdef LMP_KOKKOS_GPU
int team_size_max;
if (reduce_flag)
team_size_max = Kokkos::TeamPolicy<DeviceType>(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelReduceTag());
else
team_size_max = Kokkos::TeamPolicy<DeviceType>(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelForTag());
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
#else
team_size = 1;
#endif
return team_size;
void GetMaxTeamSize(FunctorStyle& functor, int inum,
int &teamsize_max_for, int &teamsize_max_reduce)
{
teamsize_max_for = Kokkos::TeamPolicy<DeviceType>(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelForTag());
teamsize_max_reduce = Kokkos::TeamPolicy<DeviceType>(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelReduceTag());
}
// Submit ParallelFor for NEIGHFLAG=HALF,HALFTHREAD,FULL
@ -782,38 +937,77 @@ template<class PairStyle, unsigned NEIGHFLAG, int ZEROFLAG = 0, class Specialisa
EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos<typename PairStyle::device_type>*> list) {
EV_FLOAT ev;
const int inum = list->inum;
if (!fpair->lmp->kokkos->neigh_thread_set)
if (list->inum <= 16384 && NEIGHFLAG == FULL)
fpair->lmp->kokkos->neigh_thread = 1;
if (fpair->lmp->kokkos->ngpus && inum <= 16000)
if (NEIGHFLAG == FULL || !fpair->newton_pair)
fpair->lmp->kokkos->neigh_thread = 1;
if (fpair->lmp->kokkos->neigh_thread) {
int vector_length = 8;
int atoms_per_team = 32;
static int vectorsize = 0;
static int atoms_per_team = 0;
static int lastcall = -1;
#if defined(LMP_KOKKOS_GPU)
if (!vectorsize || lastcall < fpair->lmp->neighbor->lastcall) {
lastcall = fpair->lmp->update->ntimestep;
vectorsize = GetMaxNeighs(list);
vectorsize = MathSpecial::powint(2,(int(log2(vectorsize) + 0.5))); // round to nearest power of 2
#if defined(KOKKOS_ENABLE_HIP)
int max_vectorsize = 64;
#else
int max_vectorsize = 32;
#endif
vectorsize = MIN(vectorsize,max_vectorsize);
int teamsize_max_for,teamsize_max_reduce;
if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
GetMaxTeamSize<typename PairStyle::device_type>(ff, inum, teamsize_max_for, teamsize_max_reduce);
} else {
PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
GetMaxTeamSize<typename PairStyle::device_type>(ff, inum, teamsize_max_for, teamsize_max_reduce);
}
int teamsize_max = teamsize_max_for;
if (fpair->eflag || fpair->vflag)
teamsize_max = teamsize_max_reduce;
atoms_per_team = teamsize_max/vectorsize;
}
#else
vectorsize = 1;
atoms_per_team = 1;
#endif
const int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);
if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
atoms_per_team = GetTeamSize<typename PairStyle::device_type>(ff, list->inum, (fpair->eflag || fpair->vflag), atoms_per_team, vector_length);
Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(list->inum,atoms_per_team,vector_length);
Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(num_teams,atoms_per_team,vectorsize);
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
else Kokkos::parallel_for(policy,ff);
ff.contribute();
} else {
PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
atoms_per_team = GetTeamSize<typename PairStyle::device_type>(ff, list->inum, (fpair->eflag || fpair->vflag), atoms_per_team, vector_length);
Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(list->inum,atoms_per_team,vector_length);
Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(num_teams,atoms_per_team,vectorsize);
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
else Kokkos::parallel_for(policy,ff);
ff.contribute();
}
} else {
if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
else Kokkos::parallel_for(list->inum,ff);
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(inum,ff,ev);
else Kokkos::parallel_for(inum,ff);
ff.contribute();
} else {
PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
else Kokkos::parallel_for(list->inum,ff);
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(inum,ff,ev);
else Kokkos::parallel_for(inum,ff);
ff.contribute();
}
}
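The team-policy branch above now derives the Kokkos vector length from the neighbor list instead of hard-coding 8: it takes the maximum neighbor count, rounds it to the nearest power of two in log space, and caps it at the hardware vector width (32, or 64 when built for HIP) before dividing the maximum team size by it. A minimal standalone sketch of just that rounding step, not LAMMPS code (pick_vector_length is a hypothetical name):
// vector_length_sketch.cpp - standalone illustration of the power-of-two rounding above
#include <algorithm>
#include <cmath>
#include <cstdio>
// mirrors: powint(2, int(log2(maxneigh) + 0.5)), then MIN(vectorsize, max_vectorsize)
static int pick_vector_length(int maxneigh, int max_vectorsize)
{
  if (maxneigh < 1) maxneigh = 1;
  const int vectorsize = 1 << static_cast<int>(std::log2(static_cast<double>(maxneigh)) + 0.5);
  return std::min(vectorsize, max_vectorsize);
}
int main()
{
  const int tests[] = {5, 12, 37, 90, 300};
  for (int n : tests)   // e.g. 37 neighbors -> log2(37) ~ 5.2 -> 2^5 = 32
    std::printf("maxneigh = %3d -> vector length %d\n", n, pick_vector_length(n, 32));
  return 0;
}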

View File

@ -45,10 +45,10 @@ struct ACECimpl {
using namespace LAMMPS_NS;
enum{SCALAR,VECTOR,ARRAY};
enum { SCALAR, VECTOR, ARRAY };
ComputePACE::ComputePACE(LAMMPS *lmp, int narg, char **arg) :
Compute(lmp, narg, arg), cutsq(nullptr), list(nullptr), pace(nullptr),
paceall(nullptr), pace_peratom(nullptr), map(nullptr), cg(nullptr)
Compute(lmp, narg, arg), cutsq(nullptr), list(nullptr), pace(nullptr), paceall(nullptr),
pace_peratom(nullptr), map(nullptr), cg(nullptr), c_pe(nullptr), c_virial(nullptr)
{
array_flag = 1;
extarray = 0;
@ -71,30 +71,24 @@ ComputePACE::ComputePACE(LAMMPS *lmp, int narg, char **arg) :
//read in file with CG coefficients or c_tilde coefficients
auto potential_file_name = utils::get_potential_file_path(arg[3]);
delete acecimpl -> basis_set;
acecimpl -> basis_set = new ACECTildeBasisSet(potential_file_name);
cutmax = acecimpl -> basis_set->cutoffmax;
delete acecimpl->basis_set;
acecimpl->basis_set = new ACECTildeBasisSet(potential_file_name);
cutmax = acecimpl->basis_set->cutoffmax;
//# of rank 1, rank > 1 functions
int n_r1, n_rp = 0;
n_r1 = acecimpl -> basis_set->total_basis_size_rank1[0];
n_rp = acecimpl -> basis_set->total_basis_size[0];
n_r1 = acecimpl->basis_set->total_basis_size_rank1[0];
n_rp = acecimpl->basis_set->total_basis_size[0];
int ncoeff = n_r1 + n_rp;
//int nvalues = ncoeff;
nvalues = ncoeff;
//-----------------------------------------------------------
//nperdim = ncoeff;
ndims_force = 3;
ndims_virial = 6;
bik_rows = 1;
yoffset = nvalues; //nperdim;
zoffset = 2*nvalues; //nperdim;
yoffset = nvalues;
zoffset = 2*nvalues;
natoms = atom->natoms;
if (bikflag) bik_rows = natoms;
dgrad_rows = ndims_force*natoms;
@ -247,28 +241,27 @@ void ComputePACE::compute_array()
const int typeoffset_local = ndims_peratom*nvalues*(itype-1);
const int typeoffset_global = nvalues*(itype-1);
delete acecimpl -> ace;
acecimpl -> ace = new ACECTildeEvaluator(*acecimpl -> basis_set);
acecimpl -> ace->compute_projections = 1;
acecimpl -> ace->compute_b_grad = 1;
delete acecimpl->ace;
acecimpl->ace = new ACECTildeEvaluator(*acecimpl->basis_set);
acecimpl->ace->compute_projections = true;
acecimpl->ace->compute_b_grad = true;
int n_r1, n_rp = 0;
n_r1 = acecimpl -> basis_set->total_basis_size_rank1[0];
n_rp = acecimpl -> basis_set->total_basis_size[0];
n_r1 = acecimpl->basis_set->total_basis_size_rank1[0];
n_rp = acecimpl->basis_set->total_basis_size[0];
int ncoeff = n_r1 + n_rp;
acecimpl -> ace->element_type_mapping.init(ntypes+1);
acecimpl->ace->element_type_mapping.init(ntypes+1);
for (int ik = 1; ik <= ntypes; ik++) {
for(int mu = 0; mu < acecimpl -> basis_set ->nelements; mu++){
for(int mu = 0; mu < acecimpl->basis_set->nelements; mu++){
if (mu != -1) {
if (mu == ik - 1) {
map[ik] = mu;
acecimpl -> ace->element_type_mapping(ik) = mu;
acecimpl->ace->element_type_mapping(ik) = mu;
}
}
}
}
if (dgradflag) {
// dBi/dRi tags
@ -299,9 +292,9 @@ void ComputePACE::compute_array()
}
// resize the neighbor cache after setting the basis
acecimpl -> ace->resize_neighbours_cache(max_jnum);
acecimpl -> ace->compute_atom(i, atom->x, atom->type, list->numneigh[i], list->firstneigh[i]);
Array1D<DOUBLE_TYPE> Bs = acecimpl -> ace -> projections;
acecimpl->ace->resize_neighbours_cache(max_jnum);
acecimpl->ace->compute_atom(i, atom->x, atom->type, list->numneigh[i], list->firstneigh[i]);
Array1D<DOUBLE_TYPE> Bs = acecimpl->ace->projections;
for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj];
@ -314,9 +307,9 @@ void ComputePACE::compute_array()
// dimension: (n_descriptors,max_jnum,3)
//example to access entries for neighbour jj after running compute_atom for atom i:
for (int func_ind =0; func_ind < n_r1 + n_rp; func_ind++){
DOUBLE_TYPE fx_dB = acecimpl -> ace -> neighbours_dB(func_ind,jj,0);
DOUBLE_TYPE fy_dB = acecimpl -> ace -> neighbours_dB(func_ind,jj,1);
DOUBLE_TYPE fz_dB = acecimpl -> ace -> neighbours_dB(func_ind,jj,2);
DOUBLE_TYPE fx_dB = acecimpl->ace->neighbours_dB(func_ind,jj,0);
DOUBLE_TYPE fy_dB = acecimpl->ace->neighbours_dB(func_ind,jj,1);
DOUBLE_TYPE fz_dB = acecimpl->ace->neighbours_dB(func_ind,jj,2);
pacedi[func_ind] += fx_dB;
pacedi[func_ind+yoffset] += fy_dB;
pacedi[func_ind+zoffset] += fz_dB;
@ -325,15 +318,13 @@ void ComputePACE::compute_array()
pacedj[func_ind+zoffset] -= fz_dB;
}
} else {
//printf("inside dBi/dRj logical : ncoeff = %d \n", ncoeff);
for (int iicoeff = 0; iicoeff < ncoeff; iicoeff++) {
// add to pace array for this proc
//printf("inside dBi/dRj loop\n");
// dBi/dRj
DOUBLE_TYPE fx_dB = acecimpl -> ace -> neighbours_dB(iicoeff,jj,0);
DOUBLE_TYPE fy_dB = acecimpl -> ace -> neighbours_dB(iicoeff,jj,1);
DOUBLE_TYPE fz_dB = acecimpl -> ace -> neighbours_dB(iicoeff,jj,2);
DOUBLE_TYPE fx_dB = acecimpl->ace->neighbours_dB(iicoeff,jj,0);
DOUBLE_TYPE fy_dB = acecimpl->ace->neighbours_dB(iicoeff,jj,1);
DOUBLE_TYPE fz_dB = acecimpl->ace->neighbours_dB(iicoeff,jj,2);
pace[bik_rows + ((atom->tag[j]-1)*3*natoms) + 3*(atom->tag[i]-1) + 0][iicoeff+3] -= fx_dB;
pace[bik_rows + ((atom->tag[j]-1)*3*natoms) + 3*(atom->tag[i]-1) + 1][iicoeff+3] -= fy_dB;
pace[bik_rows + ((atom->tag[j]-1)*3*natoms) + 3*(atom->tag[i]-1) + 2][iicoeff+3] -= fz_dB;

View File

@ -35,15 +35,13 @@ class ComputePACE : public Compute {
private:
int natoms, nmax, size_peratom, lastcol;
int ncoeff, nvalues, nperdim, yoffset, zoffset;
int nvalues, yoffset, zoffset;
int ndims_peratom, ndims_force, ndims_virial;
double **cutsq;
class NeighList *list;
double **pace, **paceall;
double **pace_peratom;
double rcutfac;
int *map; // map types to [0,nelements)
int nelements, chemflag;
int bikflag, bik_rows, dgradflag, dgrad_rows;
double *cg;
double cutmax;

View File

@ -299,7 +299,7 @@ void ComputeSNAAtom::compute_peratom()
// ############################################################################## //
// ##### Start of section for computing bispectrum on nnn nearest neighbors ##### //
// ############################################################################## //
if (nearest_neighbors_mode == true) {
if (nearest_neighbors_mode) {
// ##### 1) : consider full neighbor list in rlist
memory->create(distsq, jnum, "snann/atom:distsq");
memory->create(rlist, jnum, 3, "snann/atom:rlist");
@ -308,7 +308,6 @@ void ComputeSNAAtom::compute_peratom()
for (int jj = 0; jj < jnum; jj++) {
int j = jlist[jj];
j &= NEIGHMASK;
int jtype = type[j];
const double delx = xtmp - x[j][0];
const double dely = ytmp - x[j][1];
@ -614,10 +613,9 @@ double * ComputeSNAAtom::tanh_weights(double * rsq, double rcut, double delta, i
return w;
}
double ComputeSNAAtom::sum_weights(double * rsq, double * w, int ncounts)
double ComputeSNAAtom::sum_weights(double * /*rsq*/, double * w, int ncounts)
{
double S=0.;
double rloc=0.;
for (int i=0; i<ncounts; i++)
{
S += w[i];
@ -627,7 +625,7 @@ double ComputeSNAAtom::sum_weights(double * rsq, double * w, int ncounts)
double ComputeSNAAtom::get_target_rcut(double S_target, double * rsq, double rcut, int ncounts, int weightmode, double delta)
{
double S_sol;
double S_sol = 0.0;
if (weightmode == 0) {
double * www = weights(rsq, rcut, ncounts);
S_sol = sum_weights(rsq, www, ncounts);

View File

@ -39,6 +39,7 @@
#include "memory.h"
#include "potential_file_reader.h"
#include "respa.h"
#include "text_file_reader.h"
#include "update.h"
#include <cmath>
@ -49,15 +50,14 @@ using namespace LAMMPS_NS;
using namespace FixConst;
using namespace MathConst;
#define MAXLINE 256
#define LISTDELTA 10000
#define LB_FACTOR 1.5
static constexpr int LISTDELTA = 10000;
static constexpr double LB_FACTOR = 1.5;
#define CMAPMAX 6 // max # of CMAP terms stored by one atom
#define CMAPDIM 24 // grid map dimension is 24 x 24
#define CMAPXMIN -360.0
#define CMAPXMIN2 -180.0
#define CMAPDX 15.0 // 360/CMAPDIM
static constexpr int CMAPMAX = 6; // max # of CMAP terms stored by one atom
static constexpr int CMAPDIM = 24; // grid map dimension is 24 x 24
static constexpr double CMAPXMIN = -360.0;
static constexpr double CMAPXMIN2 = -180.0;
static constexpr double CMAPDX = 15.0; // 360/CMAPDIM
/* ---------------------------------------------------------------------- */
@ -86,17 +86,15 @@ FixCMAP::FixCMAP(LAMMPS *lmp, int narg, char **arg) :
wd_section = 1;
respa_level_support = 1;
ilevel_respa = 0;
MPI_Comm_rank(world,&me);
MPI_Comm_size(world,&nprocs);
eflag_caller = 1;
// allocate memory for CMAP data
memory->create(g_axis,CMAPDIM,"cmap:g_axis");
memory->create(cmapgrid,6,CMAPDIM,CMAPDIM,"cmap:grid");
memory->create(d1cmapgrid,6,CMAPDIM,CMAPDIM,"cmap:d1grid");
memory->create(d2cmapgrid,6,CMAPDIM,CMAPDIM,"cmap:d2grid");
memory->create(d12cmapgrid,6,CMAPDIM,CMAPDIM,"cmap:d12grid");
memory->create(cmapgrid,CMAPMAX,CMAPDIM,CMAPDIM,"cmap:grid");
memory->create(d1cmapgrid,CMAPMAX,CMAPDIM,CMAPDIM,"cmap:d1grid");
memory->create(d2cmapgrid,CMAPMAX,CMAPDIM,CMAPDIM,"cmap:d2grid");
memory->create(d12cmapgrid,CMAPMAX,CMAPDIM,CMAPDIM,"cmap:d12grid");
// read and setup CMAP data
@ -184,10 +182,6 @@ void FixCMAP::init()
for (i = 0; i < 6; i++)
set_map_derivatives(cmapgrid[i],d1cmapgrid[i],d2cmapgrid[i],d12cmapgrid[i]);
// define newton_bond here in case restart file was read (not data file)
newton_bond = force->newton_bond;
if (utils::strmatch(update->integrate_style,"^respa")) {
ilevel_respa = (dynamic_cast<Respa *>(update->integrate))->nlevels-1;
if (respa_level >= 0) ilevel_respa = MIN(respa_level,ilevel_respa);
@ -238,6 +232,8 @@ void FixCMAP::min_setup(int vflag)
void FixCMAP::pre_neighbor()
{
int i,m,atom1,atom2,atom3,atom4,atom5;
const int me = comm->me;
const int nprocs = comm->nprocs;
// guesstimate initial length of local crossterm list
// if ncmap was not set (due to read_restart, no read_data),
@ -637,15 +633,22 @@ void FixCMAP::read_grid_map(char *cmapfile)
{
if (comm->me == 0) {
try {
memset(&cmapgrid[0][0][0], 0, 6*CMAPDIM*CMAPDIM*sizeof(double));
ncrosstermtypes = 0;
memset(&cmapgrid[0][0][0], 0, CMAPMAX*CMAPDIM*CMAPDIM*sizeof(double));
utils::logmesg(lmp, "Reading CMAP parameters from: {}\n", cmapfile);
PotentialFileReader reader(lmp, cmapfile, "cmap grid");
// there are six maps in this order.
// there may be up to six maps.
// the charmm36.cmap file has them in this order.
// alanine, alanine-proline, proline, proline-proline, glycine, glycine-proline.
// read as one big blob of numbers while ignoring comments
reader.next_dvector(&cmapgrid[0][0][0],6*CMAPDIM*CMAPDIM);
// custom CMAP files created by charmm-gui may have fewer entries
// read one map at a time as a blob of numbers while ignoring comments
// and stop reading when we have reached EOF.
for (ncrosstermtypes = 0; ncrosstermtypes < CMAPMAX; ++ncrosstermtypes)
reader.next_dvector(&cmapgrid[ncrosstermtypes][0][0],CMAPDIM*CMAPDIM);
} catch (EOFException &) {
utils::logmesg(lmp, " Read in CMAP data for {} crossterm types\n", ncrosstermtypes);
} catch (std::exception &e) {
error->one(FLERR,"Error reading CMAP potential file: {}", e.what());
}
@ -934,10 +937,6 @@ void FixCMAP::read_data_header(char *line)
} catch (std::exception &e) {
error->all(FLERR,"Invalid read data header line for fix cmap: {}", e.what());
}
// not set in constructor because this fix could be defined before newton command
newton_bond = force->newton_bond;
}
/* ----------------------------------------------------------------------
@ -957,10 +956,10 @@ void FixCMAP::read_data_section(char * /*keyword*/, int /*n*/, char *buf,
// loop over lines of CMAP crossterms
// tokenize the line into values
// add crossterm to one of my atoms, depending on newton_bond
// add crossterm to one of my atoms
for (const auto &line : lines) {
ValueTokenizer values(line);
ValueTokenizer values(utils::trim_comment(line));
try {
values.skip();
itype = values.next_int();
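The change to read_grid_map() above reads the CMAP grids one CMAPDIM x CMAPDIM block at a time and treats an early EOF as "fewer crossterm types" instead of requiring exactly six maps. A generic sketch of that pattern with plain iostreams, not LAMMPS code (the comment stripping done by the real PotentialFileReader is omitted):
// cmap_read_sketch.cpp - read up to CMAPMAX grids and stop gracefully at EOF
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <vector>
static constexpr int CMAPMAX = 6;    // max # of CMAP grids
static constexpr int CMAPDIM = 24;   // each grid is 24 x 24
// returns the number of complete grids found in the file
static int read_cmap_grids(const char *fname, std::vector<double> &grids)
{
  std::ifstream in(fname);
  if (!in) throw std::runtime_error("cannot open CMAP file");
  grids.assign(CMAPMAX * CMAPDIM * CMAPDIM, 0.0);   // zero the table, like the memset() above
  int ntypes = 0;
  for (; ntypes < CMAPMAX; ++ntypes) {
    double *grid = grids.data() + ntypes * CMAPDIM * CMAPDIM;
    for (int k = 0; k < CMAPDIM * CMAPDIM; ++k)
      if (!(in >> grid[k])) return ntypes;          // EOF: keep only the complete grids
  }
  return ntypes;
}
int main(int argc, char **argv)
{
  if (argc < 2) { std::cerr << "usage: " << argv[0] << " file.cmap\n"; return 1; }
  std::vector<double> grids;
  const int n = read_cmap_grids(argv[1], grids);
  std::cout << "Read in CMAP data for " << n << " crossterm types\n";
  return 0;
}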

View File

@ -65,8 +65,7 @@ class FixCMAP : public Fix {
double memory_usage() override;
private:
int nprocs, me;
int newton_bond, eflag_caller;
int eflag_caller;
int ctype, ilevel_respa;
int ncrosstermtypes, crossterm_per_atom, maxcrossterm;
int ncrosstermlist;

View File

@ -59,7 +59,7 @@ void NPairBinGhostOmp<HALF>::build(NeighList *list)
#endif
NPAIR_OMP_SETUP(nall);
int i, j, k, n, itype, jtype, ibin, bin_start, which, imol, iatom;
int i, j, k, n, itype, jtype, ibin, which, imol, iatom;
tagint tagprev;
double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
int xbin, ybin, zbin, xbin2, ybin2, zbin2;

View File

@ -17,6 +17,7 @@
#include "python_impl.h"
#include "comm.h"
#include "error.h"
#include "input.h"
#include "memory.h"
@ -56,7 +57,6 @@
#endif
#include "mliap_unified_couple_kokkos.h"
#endif
#endif
@ -79,7 +79,7 @@ PythonImpl::PythonImpl(LAMMPS *lmp) : Pointers(lmp)
// Force the stdout and stderr streams to be unbuffered.
bool unbuffered = PYTHONUNBUFFERED != nullptr && strcmp(PYTHONUNBUFFERED, "1") == 0;
#if (PY_VERSION_HEX >= 0x030800f0) && !defined(__APPLE__)
#if (PY_VERSION_HEX >= 0x030800f0)
PyConfig config;
PyConfig_InitPythonConfig(&config);
config.buffered_stdio = !unbuffered;
@ -87,28 +87,30 @@ PythonImpl::PythonImpl(LAMMPS *lmp) : Pointers(lmp)
// Python Global configuration variable
Py_UnbufferedStdioFlag = unbuffered;
#endif
#else
#warning Cannot force stdout and stderr to be unbuffered
#endif
#ifdef MLIAP_PYTHON
// Inform python initialization scheme of the mliappy module.
// This -must- happen before python is initialized.
int err = PyImport_AppendInittab("mliap_model_python_couple", PyInit_mliap_model_python_couple);
if (err) error->all(FLERR, "Could not register MLIAPPY embedded python module.");
// cannot register mliappy module a second time
if (!Py_IsInitialized()) {
// Inform python initialization scheme of the mliappy module.
// This -must- happen before python is initialized.
int err = PyImport_AppendInittab("mliap_model_python_couple", PyInit_mliap_model_python_couple);
if (err) error->all(FLERR, "Could not register MLIAPPY embedded python module.");
err = PyImport_AppendInittab("mliap_unified_couple", PyInit_mliap_unified_couple);
if (err) error->all(FLERR, "Could not register MLIAPPY unified embedded python module.");
err = PyImport_AppendInittab("mliap_unified_couple", PyInit_mliap_unified_couple);
if (err) error->all(FLERR, "Could not register MLIAPPY unified embedded python module.");
#ifdef LMP_KOKKOS
// Inform python initialization scheme of the mliappy module.
// This -must- happen before python is initialized.
err = PyImport_AppendInittab("mliap_model_python_couple_kokkos", PyInit_mliap_model_python_couple_kokkos);
if (err) error->all(FLERR, "Could not register MLIAPPY embedded python module.");
err = PyImport_AppendInittab("mliap_unified_couple_kokkos", PyInit_mliap_unified_couple_kokkos);
if (err) error->all(FLERR, "Could not register MLIAPPY unified embedded python module.");
// Inform python initialization scheme of the mliappy module.
// This -must- happen before python is initialized.
err = PyImport_AppendInittab("mliap_model_python_couple_kokkos",
PyInit_mliap_model_python_couple_kokkos);
if (err) error->all(FLERR, "Could not register MLIAPPY embedded python KOKKOS module.");
err = PyImport_AppendInittab("mliap_unified_couple_kokkos", PyInit_mliap_unified_couple_kokkos);
if (err) error->all(FLERR, "Could not register MLIAPPY unified embedded python KOKKOS module.");
#endif
}
#endif
#if PY_VERSION_HEX >= 0x030800f0 && !defined(Py_LIMITED_API)
@ -122,7 +124,7 @@ PythonImpl::PythonImpl(LAMMPS *lmp) : Pointers(lmp)
// With Python 3.7 this function is now called by Py_Initialize()
// Deprecated since version 3.9, will be removed in version 3.11
#if PY_VERSION_HEX < 0x030700f0
if (!PyEval_ThreadsInitialized()) { PyEval_InitThreads(); }
if (!PyEval_ThreadsInitialized()) PyEval_InitThreads();
#endif
PyUtils::GIL lock;
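The hunk above wraps the PyImport_AppendInittab() calls in a Py_IsInitialized() guard because built-in extension modules can only be registered before the interpreter is initialized, and must not be registered a second time when another component has already started Python. A minimal embedding sketch of that rule, not LAMMPS code (the demo_couple module and the build line are assumptions):
// inittab_sketch.cpp - register a built-in module before Py_Initialize()
// assumed build line: c++ inittab_sketch.cpp $(python3-config --cflags --embed --ldflags)
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <cstdio>
static PyMethodDef demo_methods[] = {{nullptr, nullptr, 0, nullptr}};
static PyModuleDef demo_module = {PyModuleDef_HEAD_INIT, "demo_couple", nullptr, -1,
                                  demo_methods, nullptr, nullptr, nullptr, nullptr};
static PyObject *PyInit_demo_couple(void) { return PyModule_Create(&demo_module); }
int main()
{
  if (!Py_IsInitialized()) {
    // registration is only legal before Py_Initialize(); skip it if some other
    // component already brought the interpreter up (and registered the module)
    if (PyImport_AppendInittab("demo_couple", PyInit_demo_couple) != 0) {
      std::fprintf(stderr, "could not register embedded module\n");
      return 1;
    }
  }
  Py_Initialize();
  PyRun_SimpleString("import demo_couple; print('demo_couple is importable:', demo_couple)");
  Py_Finalize();
  return 0;
}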

View File

@ -34,17 +34,17 @@ using namespace ReaxFF;
/* ---------------------------------------------------------------------- */
ComputeReaxFFAtom::ComputeReaxFFAtom(LAMMPS *lmp, int narg, char **arg) :
Compute(lmp, narg, arg),
abo(nullptr), neighid(nullptr), bondcount(nullptr), reaxff(nullptr)
Compute(lmp, narg, arg), neighid(nullptr), abo(nullptr), bondcount(nullptr), reaxff(nullptr)
{
if (atom->tag_consecutive() == 0)
error->all(FLERR,"Atom IDs must be consecutive for compute reaxff/atom");
error->all(FLERR, "Atom IDs must be consecutive for compute reaxff/atom");
peratom_flag = 1;
// initialize output
nlocal = -1;
nbonds = 0;
prev_nbonds = -1;
size_peratom_cols = 3;

View File

@ -19,7 +19,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
------------------------------------------------------------------------- */
#include "fix_reaxff.h"

View File

@ -11,7 +11,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as

View File

@ -11,7 +11,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as

View File

@ -11,7 +11,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38, 245-259 (2012).
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
@ -30,6 +30,7 @@
#include "error.h"
#include "memory.h"
#include "text_file_reader.h"
#include "tokenizer.h"
#include "utils.h"
#include <cmath>
@ -40,6 +41,8 @@
using LAMMPS_NS::utils::open_potential;
using LAMMPS_NS::utils::getsyserror;
using LAMMPS_NS::utils::uppercase;
using LAMMPS_NS::EOFException;
using LAMMPS_NS::ValueTokenizer;
namespace ReaxFF {
@ -538,17 +541,20 @@ namespace ReaxFF {
}
}
// next line is number of hydrogen bond parameters
values = reader.next_values(0);
n = values.next_int();
++lineno;
// next line is number of hydrogen bond parameters. that block may be missing
for (i = 0; i < ntypes; ++i)
for (j = 0; j < ntypes; ++j)
for (k = 0; k < ntypes; ++k)
hbp[i][j][k].r0_hb = -1.0;
auto thisline = reader.next_line();
if (!thisline) throw EOFException("ReaxFF parameter file has no hydrogen bond parameters");
values = ValueTokenizer(thisline);
n = values.next_int();
++lineno;
for (i = 0; i < n; ++i) {
values = reader.next_values(0);
++lineno;
@ -570,6 +576,8 @@ namespace ReaxFF {
}
memory->destroy(tor_flag);
} catch (EOFException &e) {
error->warning(FLERR, e.what());
} catch (std::exception &e) {
error->one(FLERR,e.what());
}
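The force-field reader above now tolerates a missing hydrogen-bond section: every hbp entry is pre-filled with the sentinel r0_hb = -1.0, and running out of file is downgraded from a fatal error to a warning. A generic sketch of that parsing pattern, not LAMMPS code (the column layout and the ntypes value are assumptions):
// hbond_block_sketch.cpp - optional trailing block with sentinel defaults
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
struct HbondParam { double r0_hb = -1.0; };   // sentinel: parameters not present
int main(int argc, char **argv)
{
  if (argc < 2) { std::cerr << "usage: " << argv[0] << " ffield\n"; return 1; }
  const int ntypes = 4;                       // assumed number of element types
  std::vector<HbondParam> hbp(ntypes * ntypes * ntypes);
  std::ifstream in(argv[1]);
  std::string line;
  // ... parsing of the mandatory force-field sections would happen here ...
  if (!std::getline(in, line)) {              // optional block absent: warn, keep sentinels
    std::cerr << "warning: force field has no hydrogen bond parameters\n";
  } else {
    std::istringstream header(line);
    int n = 0;
    header >> n;                              // first line of the block: entry count
    for (int k = 0; k < n && std::getline(in, line); ++k) {
      std::istringstream entry(line);
      int i, j, l;
      double r0;
      if ((entry >> i >> j >> l >> r0) && i >= 1 && i <= ntypes && j >= 1 && j <= ntypes &&
          l >= 1 && l <= ntypes)
        hbp[((i - 1) * ntypes + (j - 1)) * ntypes + (l - 1)].r0_hb = r0;
    }
  }
  int present = 0;
  for (const auto &p : hbp)
    if (p.r0_hb > 0.0) ++present;
  std::cout << present << " hydrogen-bond entries set; all others keep the sentinel -1\n";
  return 0;
}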

View File

@ -11,7 +11,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as

View File

@ -11,7 +11,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as

View File

@ -11,7 +11,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as

View File

@ -11,7 +11,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as

View File

@ -11,7 +11,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as

View File

@ -11,7 +11,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as

View File

@ -11,7 +11,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as

View File

@ -10,7 +10,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as

View File

@ -11,7 +11,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as

View File

@ -11,7 +11,7 @@
Please cite the related publication:
H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama,
"Parallel Reactive Molecular Dynamics: Numerical Methods and
Algorithmic Techniques", Parallel Computing, in press.
Algorithmic Techniques", Parallel Computing, 38 (4-5), 245-259.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as

View File

@ -219,7 +219,6 @@ void FixRigidNHSmall::init()
}
}
int icompute;
if (tcomputeflag) {
temperature = modify->get_compute_by_id(id_temp);
if (!temperature)

View File

@ -30,7 +30,7 @@ enum { EPAIR, EVDWL, ECOUL };
ComputePair::ComputePair(LAMMPS *lmp, int narg, char **arg) :
Compute(lmp, narg, arg), pstyle(nullptr), pair(nullptr), one(nullptr)
{
if (narg < 4) error->all(FLERR, "Illegal compute pair command");
if (narg < 4) utils::missing_cmd_args(FLERR, "compute pair", error);
scalar_flag = 1;
extscalar = 1;
@ -63,7 +63,7 @@ ComputePair::ComputePair(LAMMPS *lmp, int narg, char **arg) :
else if (strcmp(arg[iarg], "ecoul") == 0)
evalue = ECOUL;
else
error->all(FLERR, "Illegal compute pair command");
error->all(FLERR, "Unknown compute pair keyword {}", arg[iarg]);
++iarg;
}
@ -75,7 +75,7 @@ ComputePair::ComputePair(LAMMPS *lmp, int narg, char **arg) :
pair = force->pair_match(pstyle, 1, nsub);
}
if (!pair) error->all(FLERR, "Unrecognized pair style in compute pair command");
if (!pair) error->all(FLERR, "Unrecognized pair style {} in compute pair command", pstyle);
npair = pair->nextra;
if (npair) {
@ -104,7 +104,7 @@ void ComputePair::init()
// recheck for pair style in case it has been deleted
pair = force->pair_match(pstyle, 1, nsub);
if (!pair) error->all(FLERR, "Unrecognized pair style in compute pair command");
if (!pair) error->all(FLERR, "Unrecognized pair style {} in compute pair command", pstyle);
}
/* ---------------------------------------------------------------------- */

View File

@ -83,7 +83,7 @@ FixBalance::FixBalance(LAMMPS *lmp, int narg, char **arg) :
// error checks
if (lbstyle == SHIFT) {
int blen = bstr.size();
const int blen = bstr.size();
for (int i = 0; i < blen; i++) {
if (bstr[i] != 'x' && bstr[i] != 'y' && bstr[i] != 'z')
error->all(FLERR,"Fix balance shift string is invalid");

View File

@ -1,4 +1,3 @@
// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
@ -31,49 +30,49 @@
using namespace LAMMPS_NS;
using namespace FixConst;
enum{BONDMAX,TLIMIT,DISKFREE,VARIABLE};
enum{LT,LE,GT,GE,EQ,NEQ,XOR};
enum{HARD,SOFT,CONTINUE};
enum{NOMSG=0,YESMSG=1};
enum { BONDMAX, TLIMIT, DISKFREE, VARIABLE };
enum { LT, LE, GT, GE, EQ, NEQ, XOR };
enum { HARD, SOFT, CONTINUE };
enum { NOMSG = 0, YESMSG = 1 };
/* ---------------------------------------------------------------------- */
FixHalt::FixHalt(LAMMPS *lmp, int narg, char **arg) :
Fix(lmp, narg, arg), idvar(nullptr), dlimit_path(nullptr)
Fix(lmp, narg, arg), idvar(nullptr), dlimit_path(nullptr)
{
if (narg < 7) error->all(FLERR,"Illegal fix halt command");
nevery = utils::inumeric(FLERR,arg[3],false,lmp);
if (nevery <= 0) error->all(FLERR,"Illegal fix halt command");
if (narg < 7) utils::missing_cmd_args(FLERR, "fix halt", error);
nevery = utils::inumeric(FLERR, arg[3], false, lmp);
if (nevery <= 0) error->all(FLERR, "Illegal fix halt command: nevery must be > 0");
// comparison args
idvar = nullptr;
int iarg = 4;
if (strcmp(arg[iarg],"tlimit") == 0) {
if (strcmp(arg[iarg], "tlimit") == 0) {
attribute = TLIMIT;
} else if (strcmp(arg[iarg],"diskfree") == 0) {
} else if (strcmp(arg[iarg], "diskfree") == 0) {
attribute = DISKFREE;
dlimit_path = utils::strdup(".");
} else if (strcmp(arg[iarg],"bondmax") == 0) {
} else if (strcmp(arg[iarg], "bondmax") == 0) {
attribute = BONDMAX;
} else {
ArgInfo argi(arg[iarg],ArgInfo::VARIABLE);
ArgInfo argi(arg[iarg], ArgInfo::VARIABLE);
if ((argi.get_type() == ArgInfo::UNKNOWN)
|| (argi.get_type() == ArgInfo::NONE)
|| (argi.get_dim() != 0))
error->all(FLERR,"Invalid fix halt attribute");
if ((argi.get_type() == ArgInfo::UNKNOWN) || (argi.get_type() == ArgInfo::NONE) ||
(argi.get_dim() != 0))
error->all(FLERR, "Invalid fix halt attribute {}", arg[iarg]);
attribute = VARIABLE;
idvar = argi.copy_name();
ivar = input->variable->find(idvar);
if (ivar < 0) error->all(FLERR,"Could not find fix halt variable name");
if (ivar < 0) error->all(FLERR, "Could not find fix halt variable name");
if (input->variable->equalstyle(ivar) == 0)
error->all(FLERR,"Fix halt variable is not equal-style variable");
error->all(FLERR, "Fix halt variable is not equal-style variable");
}
// clang-format off
++iarg;
if (strcmp(arg[iarg],"<") == 0) operation = LT;
else if (strcmp(arg[iarg],"<=") == 0) operation = LE;
@ -85,7 +84,7 @@ FixHalt::FixHalt(LAMMPS *lmp, int narg, char **arg) :
else error->all(FLERR,"Invalid fix halt operator");
++iarg;
value = utils::numeric(FLERR,arg[iarg],false,lmp);
value = utils::numeric(FLERR, arg[iarg], false, lmp);
// parse optional args
@ -93,38 +92,40 @@ FixHalt::FixHalt(LAMMPS *lmp, int narg, char **arg) :
msgflag = YESMSG;
++iarg;
while (iarg < narg) {
if (strcmp(arg[iarg],"error") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal fix halt command");
if (strcmp(arg[iarg+1],"hard") == 0) eflag = HARD;
else if (strcmp(arg[iarg+1],"soft") == 0) eflag = SOFT;
else if (strcmp(arg[iarg+1],"continue") == 0) eflag = CONTINUE;
else error->all(FLERR,"Illegal fix halt command");
if (strcmp(arg[iarg], "error") == 0) {
if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, "fix halt error", error);
if (strcmp(arg[iarg + 1], "hard") == 0) eflag = HARD;
else if (strcmp(arg[iarg + 1], "soft") == 0) eflag = SOFT;
else if (strcmp(arg[iarg + 1], "continue") == 0) eflag = CONTINUE;
else error->all(FLERR, "Unknown fix halt error condition {}", arg[iarg]);
iarg += 2;
} else if (strcmp(arg[iarg],"message") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal fix halt command");
msgflag = utils::logical(FLERR,arg[iarg+1],false,lmp);
} else if (strcmp(arg[iarg], "message") == 0) {
if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, "fix halt message", error);
msgflag = utils::logical(FLERR, arg[iarg + 1], false, lmp);
iarg += 2;
} else if (strcmp(arg[iarg],"path") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal fix halt command");
} else if (strcmp(arg[iarg], "path") == 0) {
if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, "fix halt error", error);
++iarg;
delete[] dlimit_path;
// strip off outer quotes, if present
int len = strlen(arg[iarg])+1;
if ( ((arg[iarg][0] == '"') || (arg[iarg][0] == '\''))
&& (arg[iarg][0] == arg[iarg][len-2])) {
arg[iarg][len-2] = '\0';
dlimit_path = utils::strdup(arg[iarg]+1);
} else dlimit_path = utils::strdup(arg[iarg]);
int len = strlen(arg[iarg]) + 1;
if (((arg[iarg][0] == '"') || (arg[iarg][0] == '\'')) &&
(arg[iarg][0] == arg[iarg][len - 2])) {
arg[iarg][len - 2] = '\0';
dlimit_path = utils::strdup(arg[iarg] + 1);
} else
dlimit_path = utils::strdup(arg[iarg]);
++iarg;
} else error->all(FLERR,"Illegal fix halt command");
} else error->all(FLERR, "Unknown fix halt keyword {}", arg[iarg]);
}
// clang-format on
// add nfirst to all computes that store invocation times
// since don't know a priori which are invoked via variables by this fix
// once in end_of_step() can set timestep for ones actually invoked
if (attribute == VARIABLE) {
const bigint nfirst = (update->ntimestep/nevery)*nevery + nevery;
const bigint nfirst = (update->ntimestep / nevery) * nevery + nevery;
modify->addstep_compute_all(nfirst);
}
}
@ -133,8 +134,8 @@ FixHalt::FixHalt(LAMMPS *lmp, int narg, char **arg) :
FixHalt::~FixHalt()
{
delete [] idvar;
delete [] dlimit_path;
delete[] idvar;
delete[] dlimit_path;
}
/* ---------------------------------------------------------------------- */
@ -156,22 +157,22 @@ void FixHalt::init()
if (attribute == VARIABLE) {
ivar = input->variable->find(idvar);
if (ivar < 0) error->all(FLERR,"Could not find fix halt variable name");
if (ivar < 0) error->all(FLERR, "Could not find fix halt variable {}", idvar);
if (input->variable->equalstyle(ivar) == 0)
error->all(FLERR,"Fix halt variable is not equal-style variable");
error->all(FLERR, "Fix halt variable {} is not equal-style variable", idvar);
}
// settings used by TLIMIT
nextstep = (update->ntimestep/nevery)*nevery + nevery;
nextstep = (update->ntimestep / nevery) * nevery + nevery;
thisstep = -1;
tratio = 0.5;
// check if disk limit is supported
if (attribute == DISKFREE) {
if (diskfree() < 0.0)
error->all(FLERR,"Disk limit not supported by OS or illegal path");
if (!dlimit_path || platform::disk_free(dlimit_path) < 0.0)
error->all(FLERR, "Disk limit not supported by OS or illegal path");
}
}
@ -196,7 +197,7 @@ void FixHalt::end_of_step()
if (update->ntimestep != nextstep) return;
attvalue = tlimit();
} else if (attribute == DISKFREE) {
attvalue = diskfree();
attvalue = platform::disk_free(dlimit_path) / 1048576.0; // MBytes
} else if (attribute == BONDMAX) {
attvalue = bondmax();
} else {
@ -205,6 +206,10 @@ void FixHalt::end_of_step()
modify->addstep_compute(update->ntimestep + nevery);
}
// ensure that the attribute is *exactly* the same on all ranks
MPI_Bcast(&attvalue, 1, MPI_DOUBLE, 0, world);
// check if halt is triggered, else just return
if (operation == LT) {
@ -220,21 +225,19 @@ void FixHalt::end_of_step()
} else if (operation == NEQ) {
if (attvalue == value) return;
} else if (operation == XOR) {
if ((attvalue == 0.0 && value == 0.0) ||
(attvalue != 0.0 && value != 0.0)) return;
if ((attvalue == 0.0 && value == 0.0) || (attvalue != 0.0 && value != 0.0)) return;
}
// hard halt -> exit LAMMPS
// soft/continue halt -> trigger timer to break from run loop
// print message with ID of fix halt in case multiple instances
std::string message = fmt::format("Fix halt condition for fix-id {} met on "
"step {} with value {}",
std::string message = fmt::format("Fix halt condition for fix-id {} met on step {} with value {}",
id, update->ntimestep, attvalue);
if (eflag == HARD) {
error->all(FLERR,message);
} else if (eflag == SOFT || eflag == CONTINUE) {
if (comm->me == 0 && msgflag == YESMSG) error->message(FLERR,message);
error->all(FLERR, message);
} else if ((eflag == SOFT) || (eflag == CONTINUE)) {
if ((comm->me == 0) && (msgflag == YESMSG)) error->message(FLERR, message);
timer->force_timeout();
}
}
@ -260,8 +263,8 @@ double FixHalt::bondmax()
int **bondlist = neighbor->bondlist;
int nbondlist = neighbor->nbondlist;
int i1,i2;
double delx,dely,delz,rsq;
int i1, i2;
double delx, dely, delz, rsq;
double maxone = 0.0;
for (int n = 0; n < nbondlist; n++) {
@ -272,12 +275,12 @@ double FixHalt::bondmax()
dely = x[i1][1] - x[i2][1];
delz = x[i1][2] - x[i2][2];
rsq = delx*delx + dely*dely + delz*delz;
maxone = MAX(rsq,maxone);
rsq = delx * delx + dely * dely + delz * delz;
maxone = MAX(rsq, maxone);
}
double maxall;
MPI_Allreduce(&maxone,&maxall,1,MPI_DOUBLE,MPI_MAX,world);
MPI_Allreduce(&maxone, &maxall, 1, MPI_DOUBLE, MPI_MAX, world);
return sqrt(maxall);
}
@ -291,48 +294,15 @@ double FixHalt::bondmax()
double FixHalt::tlimit()
{
double cpu = timer->elapsed(Timer::TOTAL);
MPI_Bcast(&cpu,1,MPI_DOUBLE,0,world);
MPI_Bcast(&cpu, 1, MPI_DOUBLE, 0, world);
if (cpu < value) {
bigint elapsed = update->ntimestep - update->firststep;
bigint final = update->firststep +
static_cast<bigint> (tratio*value/cpu * elapsed);
nextstep = (final/nevery)*nevery + nevery;
bigint final = update->firststep + static_cast<bigint>(tratio * value / cpu * elapsed);
nextstep = (final / nevery) * nevery + nevery;
if (nextstep == update->ntimestep) nextstep += nevery;
tratio = 1.0;
}
return cpu;
}
/* ----------------------------------------------------------------------
determine available disk space, if supported. Return -1 if not.
------------------------------------------------------------------------- */
#if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__DragonFly__) || defined(__OpenBSD__) || defined(__NetBSD__)
#include <sys/statvfs.h>
#endif
double FixHalt::diskfree()
{
#if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__DragonFly__) || defined(__OpenBSD__) || defined(__NetBSD__)
struct statvfs fs;
double disk_free = -1.0;
if (dlimit_path) {
disk_free = 1.0e100;
int rv = statvfs(dlimit_path,&fs);
if (rv == 0) {
#if defined(__linux__)
disk_free = fs.f_bavail*fs.f_bsize/1048576.0;
#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__DragonFly__) || defined(__OpenBSD__) || defined(__NetBSD__)
disk_free = fs.f_bavail*fs.f_frsize/1048576.0;
#endif
} else
disk_free = -1.0;
MPI_Bcast(&disk_free,1,MPI_DOUBLE,0,world);
}
return disk_free;
#else
return -1.0;
#endif
}
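The new MPI_Bcast of attvalue above matters because quantities such as free disk space or elapsed wall time can differ slightly between ranks; if each rank compared its own value, some could trigger the halt while others continue. A minimal MPI sketch of broadcasting the measurement before the collective decision, not LAMMPS code:
// halt_bcast_sketch.cpp - base a collective decision on one identical number
#include <mpi.h>
#include <cstdio>
int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  // per-rank measurement: deliberately made slightly different on every rank
  double attvalue = 100.0 + 0.001 * rank;
  // ensure the attribute is *exactly* the same on all ranks before comparing
  MPI_Bcast(&attvalue, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  const double threshold = 100.0005;
  if (attvalue < threshold)   // without the broadcast, only rank 0 would take this branch
    std::printf("rank %d: halt condition met with value %g\n", rank, attvalue);
  MPI_Finalize();
  return 0;
}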

View File

@ -114,7 +114,7 @@ Grid2d::Grid2d(LAMMPS *lmp, MPI_Comm gcomm, int gnx, int gny, int ixlo, int ixhi
// additional initialization
// other constructor invokes this from setup_grid()
initialize();
Grid2d::initialize();
}
/* ---------------------------------------------------------------------- */
@ -522,7 +522,7 @@ void Grid2d::ghost_grid()
// also ensure no other procs use ghost cells beyond +y limit
if (yextra) {
if (layout != Comm::LAYOUT_TILED) {
if (comm->layout != Comm::LAYOUT_TILED) {
if (comm->myloc[1] == comm->procgrid[1]-1) inyhi = outyhi = ny - 1;
} else {
if (comm->mysplit[1][1] == 1.0) inyhi = outyhi = ny - 1;
@ -553,15 +553,13 @@ void Grid2d::ghost_grid()
void Grid2d::extract_comm_info()
{
layout = comm->layout;
// for non TILED layout:
// proc xyz lohi = my 4 neighbor procs in this MPI_Comm
// these proc IDs can be overridden by caller using set_proc_neighs()
// xyz split = copy of 1d vectors in Comm
// grid2proc = copy of 3d array in Comm
if (layout != Comm::LAYOUT_TILED) {
if (comm->layout != Comm::LAYOUT_TILED) {
procxlo = comm->procneigh[0][0];
procxhi = comm->procneigh[0][1];
procylo = comm->procneigh[1][0];
@ -585,7 +583,7 @@ void Grid2d::extract_comm_info()
// RCBinfo.cut = this proc's inlo in that dim
// Allgather creates the tree of dims and cuts
if (layout == Comm::LAYOUT_TILED) {
if (comm->layout == Comm::LAYOUT_TILED) {
rcbinfo = (RCBinfo *)
memory->smalloc(nprocs*sizeof(RCBinfo),"grid3d:rcbinfo");
RCBinfo rcbone;
@ -615,7 +613,7 @@ void Grid2d::extract_comm_info()
void Grid2d::setup_comm(int &nbuf1, int &nbuf2)
{
if (layout != Comm::LAYOUT_TILED) setup_comm_brick(nbuf1,nbuf2);
if (comm->layout != Comm::LAYOUT_TILED) setup_comm_brick(nbuf1,nbuf2);
else setup_comm_tiled(nbuf1,nbuf2);
}
@ -1039,7 +1037,7 @@ void Grid2d::setup_comm_tiled(int &nbuf1, int &nbuf2)
int Grid2d::ghost_adjacent()
{
if (layout != Comm::LAYOUT_TILED) return ghost_adjacent_brick();
if (comm->layout != Comm::LAYOUT_TILED) return ghost_adjacent_brick();
return ghost_adjacent_tiled();
}
@ -1085,7 +1083,7 @@ int Grid2d::ghost_adjacent_tiled()
void Grid2d::forward_comm(int caller, void *ptr, int which, int nper, int nbyte,
void *buf1, void *buf2, MPI_Datatype datatype)
{
if (layout != Comm::LAYOUT_TILED) {
if (comm->layout != Comm::LAYOUT_TILED) {
if (caller == KSPACE)
forward_comm_brick<KSpace>((KSpace *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
@ -1190,7 +1188,7 @@ forward_comm_tiled(T *ptr, int which, int nper, int nbyte,
void Grid2d::reverse_comm(int caller, void *ptr, int which, int nper, int nbyte,
void *buf1, void *buf2, MPI_Datatype datatype)
{
if (layout != Comm::LAYOUT_TILED) {
if (comm->layout != Comm::LAYOUT_TILED) {
if (caller == KSPACE)
reverse_comm_brick<KSpace>((KSpace *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
@ -1314,10 +1312,6 @@ void Grid2d::setup_remap(Grid2d *old, int &nremap_buf1, int &nremap_buf2)
deallocate_remap();
// set layout to current Comm layout
layout = comm->layout;
// overlaps of my old decomp owned box with all owned boxes in new decomp
// noverlap_old = # of overlaps, including self
// overlap_old = vector of overlap info in Overlap data struct
@ -1654,7 +1648,7 @@ int Grid2d::compute_overlap(int ghostflag, int *box, int *pbc, Overlap *&overlap
// test obox against appropriate layout
if (layout != Comm::LAYOUT_TILED) {
if (comm->layout != Comm::LAYOUT_TILED) {
// find comm->procgrid indices in each dim for box bounds

View File

@ -55,7 +55,6 @@ class Grid2d : protected Pointers {
protected:
int me, nprocs;
int layout; // not TILED or TILED, same as Comm class
MPI_Comm gridcomm; // communicator for this class
// usually world, but MSM calls with subset

View File

@ -123,7 +123,7 @@ Grid3d::Grid3d(LAMMPS *lmp, MPI_Comm gcomm, int gnx, int gny, int gnz,
// additional initialization
// other constructor invokes this from setup_grid()
initialize();
Grid3d::initialize();
}
/* ---------------------------------------------------------------------- */
@ -577,7 +577,7 @@ void Grid3d::ghost_grid()
// also ensure no other procs use ghost cells beyond +z limit
if (zextra) {
if (layout != Comm::LAYOUT_TILED) {
if (comm->layout != Comm::LAYOUT_TILED) {
if (comm->myloc[2] == comm->procgrid[2]-1) inzhi = outzhi = nz - 1;
} else {
if (comm->mysplit[2][1] == 1.0) inzhi = outzhi = nz - 1;
@ -613,15 +613,13 @@ void Grid3d::ghost_grid()
void Grid3d::extract_comm_info()
{
layout = comm->layout;
// for non TILED layout:
// proc xyz lohi = my 6 neighbor procs in this MPI_Comm
// these proc IDs can be overridden by caller using set_proc_neighs()
// xyz split = copy of 1d vectors in Comm
// grid2proc = copy of 3d array in Comm
if (layout != Comm::LAYOUT_TILED) {
if (comm->layout != Comm::LAYOUT_TILED) {
procxlo = comm->procneigh[0][0];
procxhi = comm->procneigh[0][1];
procylo = comm->procneigh[1][0];
@ -649,7 +647,7 @@ void Grid3d::extract_comm_info()
// RCBinfo.cut = this proc's inlo in that dim
// Allgather creates the tree of dims and cuts
if (layout == Comm::LAYOUT_TILED) {
if (comm->layout == Comm::LAYOUT_TILED) {
rcbinfo = (RCBinfo *)
memory->smalloc(nprocs*sizeof(RCBinfo),"grid3d:rcbinfo");
RCBinfo rcbone;
@ -680,7 +678,7 @@ void Grid3d::extract_comm_info()
void Grid3d::setup_comm(int &nbuf1, int &nbuf2)
{
if (layout != Comm::LAYOUT_TILED) setup_comm_brick(nbuf1,nbuf2);
if (comm->layout != Comm::LAYOUT_TILED) setup_comm_brick(nbuf1,nbuf2);
else setup_comm_tiled(nbuf1,nbuf2);
}
@ -1207,7 +1205,7 @@ void Grid3d::setup_comm_tiled(int &nbuf1, int &nbuf2)
int Grid3d::ghost_adjacent()
{
if (layout != Comm::LAYOUT_TILED) return ghost_adjacent_brick();
if (comm->layout != Comm::LAYOUT_TILED) return ghost_adjacent_brick();
return ghost_adjacent_tiled();
}
@ -1255,7 +1253,7 @@ int Grid3d::ghost_adjacent_tiled()
void Grid3d::forward_comm(int caller, void *ptr, int which, int nper, int nbyte,
void *buf1, void *buf2, MPI_Datatype datatype)
{
if (layout != Comm::LAYOUT_TILED) {
if (comm->layout != Comm::LAYOUT_TILED) {
if (caller == KSPACE)
forward_comm_brick<KSpace>((KSpace *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
@ -1360,7 +1358,7 @@ forward_comm_tiled(T *ptr, int which, int nper, int nbyte,
void Grid3d::reverse_comm(int caller, void *ptr, int which, int nper, int nbyte,
void *buf1, void *buf2, MPI_Datatype datatype)
{
if (layout != Comm::LAYOUT_TILED) {
if (comm->layout != Comm::LAYOUT_TILED) {
if (caller == KSPACE)
reverse_comm_brick<KSpace>((KSpace *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
@ -1484,10 +1482,6 @@ void Grid3d::setup_remap(Grid3d *old, int &nremap_buf1, int &nremap_buf2)
deallocate_remap();
// set layout to current Comm layout
layout = comm->layout;
// overlaps of my old decomp owned box with all owned boxes in new decomp
// noverlap_old = # of overlaps, including self
// overlap_old = vector of overlap info in Overlap data struct
@ -1829,7 +1823,7 @@ int Grid3d::compute_overlap(int ghostflag, int *box, int *pbc, Overlap *&overlap
return noverlap_list;
}
if (layout != Comm::LAYOUT_TILED) {
if (comm->layout != Comm::LAYOUT_TILED) {
// find comm->procgrid indices in each dim for box bounds

View File

@ -57,7 +57,6 @@ class Grid3d : protected Pointers {
protected:
int me, nprocs;
int layout; // not TILED or TILED, same as Comm class
MPI_Comm gridcomm; // communicator for this class
// usually world, but MSM calls with subset

View File

@ -709,7 +709,7 @@ void lammps_commands_string(void *handle, const char *str)
break;
}
lmp->input->one(cmd.c_str());
lmp->input->one(cmd);
}
}
}

View File

@ -1791,16 +1791,17 @@ void Neighbor::print_pairwise_info()
out += fmt::format(", trim from ({})",rq->copylist+1);
else
out += fmt::format(", copy from ({})",rq->copylist+1);
} else if (rq->halffull)
} else if (rq->halffull) {
if (rq->trim)
out += fmt::format(", half/full trim from ({})",rq->halffulllist+1);
else
out += fmt::format(", half/full from ({})",rq->halffulllist+1);
else if (rq->skip)
} else if (rq->skip) {
if (rq->trim)
out += fmt::format(", skip trim from ({})",rq->skiplist+1);
else
out += fmt::format(", skip from ({})",rq->skiplist+1);
}
out += "\n";
// list of neigh list attributes

View File

@ -55,9 +55,9 @@ void NStencilBin<HALF, DIM_3D, TRI>::create()
// Now only include "upper right" bins for half and ortho stencils
if (HALF && (!DIM_3D) && (!TRI))
if (! (j > 0 || (j == 0 && i > 0))) continue;
if (j <= 0 && (j != 0 || i <= 0)) continue;
if (HALF && DIM_3D && (!TRI))
if (! (k > 0 || j > 0 || (j == 0 && i > 0))) continue;
if (k <= 0 && j <= 0 && (j != 0 || i <= 0)) continue;
if (bin_distance(i, j, k) < cutneighmaxsq)
stencil[nstencil++] = k * mbiny * mbinx + j * mbinx + i;

View File

@ -115,9 +115,9 @@ void NStencilMulti<HALF, DIM_3D, TRI>::create()
if (HALF && (!TRI)) {
if (half_flag) {
if (DIM_3D) {
if (! (k > 0 || j > 0 || (j == 0 && i > 0))) continue;
if (k <= 0 && j <= 0 && (j != 0 || i <= 0)) continue;
} else {
if (! (j > 0 || (j == 0 && i > 0))) continue;
if (j <= 0 && (j != 0 || i <= 0)) continue;
}
}
}

View File

@ -65,9 +65,9 @@ void NStencilMultiOld<HALF, DIM_3D, TRI>::create()
// Now only include "upper right" bins for half and ortho stencils
if (HALF && (!DIM_3D) && (!TRI))
if (! (j > 0 || (j == 0 && i > 0))) continue;
if (j <= 0 && (j != 0 || i <= 0)) continue;
if (HALF && DIM_3D && (!TRI))
if (! (k > 0 || j > 0 || (j == 0 && i > 0))) continue;
if (k <= 0 && j <= 0 && (j != 0 || i <= 0)) continue;
rsq = bin_distance(i, j, k);
if (rsq < typesq) {

View File

@ -61,6 +61,13 @@
#include <fcntl.h>
#include <sys/syslimits.h>
#endif
// for disk_free()
#if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__DragonFly__) || \
defined(__OpenBSD__) || defined(__NetBSD__)
#include <sys/statvfs.h>
#endif
////////////////////////////////////////////////////////////////////////
#include <chrono>
@ -1047,6 +1054,36 @@ bool platform::file_is_readable(const std::string &path)
}
return false;
}
/* ----------------------------------------------------------------------
determine available disk space, if supported. Return -1 if not.
------------------------------------------------------------------------- */
double platform::disk_free(const std::string &path)
{
double bytes_free = -1.0;
#if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__DragonFly__) || \
defined(__OpenBSD__) || defined(__NetBSD__)
struct statvfs fs;
if (path.size()) {
int rv = statvfs(path.c_str(), &fs);
if (rv == 0) {
#if defined(__linux__)
bytes_free = fs.f_bavail * fs.f_bsize;
#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__DragonFly__) || \
defined(__OpenBSD__) || defined(__NetBSD__)
bytes_free = fs.f_bavail * fs.f_frsize;
#endif
}
}
#elif defined(_WIN32)
uint64_t is_free = 0;
if (GetDiskFreeSpaceEx(path.c_str(), (PULARGE_INTEGER) &is_free, nullptr, nullptr))
bytes_free = is_free;
#endif
return bytes_free;
}
/* ----------------------------------------------------------------------
check if filename has a known compression extension
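The platform::disk_free() helper added above reports bytes (or -1.0 when the query is unsupported), whereas the FixHalt::diskfree() code removed earlier in this diff reported megabytes. A minimal caller sketch, assuming a LAMMPS build where the platform.h utility header is available; the helper name and the output formatting are made up for illustration.
#include "platform.h"   // LAMMPS utility header declaring LAMMPS_NS::platform::disk_free()
#include <cstdio>
#include <string>
// Hypothetical helper (not part of the commit): query free space for a path
// and convert bytes to megabytes, as the removed FixHalt code used to do.
static void report_free_space(const std::string &path)
{
  const double bytes_free = LAMMPS_NS::platform::disk_free(path);
  if (bytes_free < 0.0)
    std::printf("free-space reporting not available for %s\n", path.c_str());
  else
    std::printf("%s: %.1f MB available\n", path.c_str(), bytes_free / 1048576.0);
}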

View File

@ -377,6 +377,15 @@ namespace platform {
bool file_is_readable(const std::string &path);
/*! Return free disk space in bytes of file system pointed to by path
*
* Returns -1.0 if the path is invalid or free space reporting not supported.
*
* \param path file or folder path in file system
* \return free disk space in bytes or -1.0 if unsupported */
double disk_free(const std::string &path);
/*! Check if a file name ends in a known extension for a compressed file format
*
* Currently supported file extensions are: .gz, .bz2, .zst, .xz, .lzma, lz4

View File

@ -144,6 +144,8 @@ void PotentialFileReader::next_dvector(double *list, int n)
{
try {
return reader->next_dvector(list, n);
} catch (EOFException &) {
throw EOFException("EOF reached");
} catch (FileReaderException &e) {
error->one(FLERR, e.what());
}

View File

@ -189,8 +189,9 @@ void TextFileReader::next_dvector(double *list, int n)
char *ptr = next_line();
if (ptr == nullptr) {
// EOF
if (i < n) {
if (i == 0) { // EOF without any records
throw EOFException("EOF reached");
} else if (i < n) { // EOF with incomplete data
throw FileReaderException(
fmt::format("Incorrect format in {} file! {}/{} values", filetype, i, n));
}
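The new EOFException path above lets callers tell a clean end of file apart from a truncated block; PotentialFileReader (earlier in this diff) simply re-throws it. The loop below is a minimal sketch under the assumption that TextFileReader, EOFException, and FileReaderException are all visible via text_file_reader.h; the helper name, block handling, and consumer are hypothetical.
#include "text_file_reader.h"   // assumed to declare TextFileReader and both exception types
#include <vector>
using namespace LAMMPS_NS;
// Hypothetical helper (not part of the commit): read fixed-size blocks of n
// doubles until the file ends cleanly in front of a block.
static void read_all_blocks(TextFileReader &reader, int n)
{
  std::vector<double> block(n);
  while (true) {
    try {
      reader.next_dvector(block.data(), n);
    } catch (EOFException &) {
      break;   // EOF before any value of the next block: normal termination
    }
    // a FileReaderException (incomplete block, bad token) is left to the caller
    // ... hand "block" to whatever consumes the data here ...
  }
}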

View File

@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.16)
project(lammps-gui VERSION 1.5.10 LANGUAGES CXX)
project(lammps-gui VERSION 1.5.11 LANGUAGES CXX)
set(CMAKE_AUTOUIC ON)
set(CMAKE_AUTOMOC ON)

View File

@ -254,6 +254,7 @@ compute_nbond_atom.html compute nbond/atom
compute_omega_chunk.html compute omega/chunk
compute_orientorder_atom.html compute orientorder/atom
compute_orientorder_atom.html compute orientorder/atom/kk
compute_pace.html compute pace
compute_pair_local.html compute pair/local
compute_pair.html compute pair
compute_pe_atom.html compute pe/atom
@ -267,13 +268,17 @@ compute_property_chunk.html compute property/chunk
compute_property_grid.html compute property/grid
compute_property_local.html compute property/local
compute_ptm_atom.html compute ptm/atom
compute_rattlers_atom.html compute rattlers/atom
compute_rdf.html compute rdf
compute_reaxff_atom.html compute reaxff/atom
compute_reaxff_atom.html compute reaxff/atom/kk
compute_reduce_chunk.html compute reduce/chunk
compute_reduce.html compute reduce
compute_reduce.html compute reduce/region
compute_rigid_local.html compute rigid/local
compute.html compute
compute_saed.html compute saed
compute_slcsa_atom.html compute slcsa/atom
compute_slice.html compute slice
compute_smd_contact_radius.html compute smd/contact/radius
compute_smd_damage.html compute smd/damage
@ -484,6 +489,7 @@ fix_drude_transform.html fix drude/transform/inverse
fix_dt_reset.html fix dt/reset
fix_dt_reset.html fix dt/reset/kk
fix_efield.html fix efield
fix_efield.html fix efield/kk
fix_efield.html fix efield/tip4p
fix_ehex.html fix ehex
fix_electrode.html fix electrode/conp
@ -565,6 +571,7 @@ fix_nh.html fix nvt/kk
fix_nh.html fix nvt/omp
fix_nh_uef.html fix npt/uef
fix_nh_uef.html fix nvt/uef
fix_nonaffine_displacement.html fix nonaffine/displacement
fix_nph_asphere.html fix nph/asphere
fix_nph_asphere.html fix nph/asphere/omp
fix_nph_body.html fix nph/body
@ -634,6 +641,7 @@ fix_polarize.html fix polarize/functional
fix_pour.html fix pour
fix_precession_spin.html fix precession/spin
fix_press_berendsen.html fix press/berendsen
fix_press_langevin.html fix press/langevin
fix_print.html fix print
fix_propel_self.html fix propel/self
fix_property_atom.html fix property/atom
@ -703,14 +711,17 @@ fix_spring_chunk.html fix spring/chunk
fix_spring_rg.html fix spring/rg
fix_spring.html fix spring
fix_spring_self.html fix spring/self
fix_spring_self.html fix spring/self/kk
fix_srd.html fix srd
fix_store_force.html fix store/force
fix_store_state.html fix store/state
fix_temp_berendsen.html fix temp/berendsen
fix_temp_berendsen.html fix temp/berendsen/kk
fix_temp_csvr.html fix temp/csld
fix_temp_csvr.html fix temp/csvr
fix_temp_rescale_eff.html fix temp/rescale/eff
fix_temp_rescale.html fix temp/rescale
fix_temp_rescale.html fix temp/rescale/kk
fix_tfmc.html fix tfmc
fix_tgnh_drude.html fix tgnpt/drude
fix_tgnh_drude.html fix tgnvt/drude
@ -980,6 +991,7 @@ pair_coul_shield.html pair_style coul/shield
pair_coul_slater.html pair_style coul/slater
pair_coul_slater.html pair_style coul/slater/cut
pair_coul_slater.html pair_style coul/slater/long
pair_coul_slater.html pair_style coul/slater/long/gpu
pair_coul_tt.html pair_style coul/tt
pair_cs.html pair_style born/coul/dsf/cs
pair_cs.html pair_style born/coul/long/cs
@ -1073,8 +1085,10 @@ pair_fep_soft.html pair_style lj/class2/coul/cut/soft
pair_fep_soft.html pair_style lj/class2/coul/long/soft
pair_fep_soft.html pair_style lj/class2/soft
pair_fep_soft.html pair_style lj/cut/coul/cut/soft
pair_fep_soft.html pair_style lj/cut/coul/cut/soft/gpu
pair_fep_soft.html pair_style lj/cut/coul/cut/soft/omp
pair_fep_soft.html pair_style lj/cut/coul/long/soft
pair_fep_soft.html pair_style lj/cut/coul/long/soft/gpu
pair_fep_soft.html pair_style lj/cut/coul/long/soft/omp
pair_fep_soft.html pair_style lj/cut/soft
pair_fep_soft.html pair_style lj/cut/soft/omp
@ -1225,7 +1239,9 @@ pair_meam_sw_spline.html pair_style meam/sw/spline
pair_mesocnt.html pair_style mesocnt
pair_mesocnt.html pair_style mesocnt/viscous
pair_mesodpd.html pair_style edpd
pair_mesodpd.html pair_style edpd/gpu
pair_mesodpd.html pair_style mdpd
pair_mesodpd.html pair_style mdpd/gpu
pair_mesodpd.html pair_style mdpd/rhosum
pair_mesodpd.html pair_style tdpd
pair_mgpt.html pair_style mgpt
@ -1245,7 +1261,8 @@ pair_morse.html pair_style morse/smooth/linear/omp
pair_multi_lucy.html pair_style multi/lucy
pair_multi_lucy_rx.html pair_style multi/lucy/rx
pair_multi_lucy_rx.html pair_style multi/lucy/rx/kk
pair_nb3b_harmonic.html pair_style nb3b/harmonic
pair_nb3b.html pair_style nb3b/harmonic
pair_nb3b.html pair_style nb3b/screened
pair_nm.html pair_style nm/cut
pair_nm.html pair_style nm/cut/coul/cut
pair_nm.html pair_style nm/cut/coul/cut/omp
@ -1303,16 +1320,20 @@ pair_smd_triangulated_surface.html pair_style smd/tri_surface
pair_smd_ulsph.html pair_style smd/ulsph
pair_smtbq.html pair_style smtbq
pair_snap.html pair_style snap
pair_snap.html pair_style snap/intel
pair_snap.html pair_style snap/kk
pair_soft.html pair_style soft
pair_soft.html pair_style soft/gpu
pair_soft.html pair_style soft/omp
pair_sph_heatconduction.html pair_style sph/heatconduction
pair_sph_heatconduction.html pair_style sph/heatconduction/gpu
pair_sph_idealgas.html pair_style sph/idealgas
pair_sph_lj.html pair_style sph/lj
pair_sph_lj.html pair_style sph/lj/gpu
pair_sph_rhosum.html pair_style sph/rhosum
pair_sph_taitwater_morris.html pair_style sph/taitwater/morris
pair_sph_taitwater.html pair_style sph/taitwater
pair_sph_taitwater.html pair_style sph/taitwater/gpu
pair_spica.html pair_style lj/spica
pair_spica.html pair_style lj/spica/coul/long
pair_spica.html pair_style lj/spica/coul/long/gpu
@ -1384,6 +1405,7 @@ pair_write.html pair_write
pair_ylz.html pair_style ylz
pair_yukawa_colloid.html pair_style yukawa/colloid
pair_yukawa_colloid.html pair_style yukawa/colloid/gpu
pair_yukawa_colloid.html pair_style yukawa/colloid/kk
pair_yukawa_colloid.html pair_style yukawa/colloid/omp
pair_yukawa.html pair_style yukawa
pair_yukawa.html pair_style yukawa/gpu

View File

@ -36,7 +36,7 @@ int main(int argc, char *argv[])
LammpsGui w(nullptr, infile);
w.show();
return a.exec();
return QApplication::exec();
}
// Local Variables:

View File

@ -177,7 +177,7 @@ void Preferences::accept()
msg.exec();
const char *path = mystrdup(QCoreApplication::applicationFilePath());
const char *arg0 = mystrdup(QCoreApplication::arguments().at(0));
execl(path, arg0, (char *)NULL);
execl(path, arg0, (char *)nullptr);
}
// reformatting settings

View File

@ -38,7 +38,7 @@
StdCapture::StdCapture() : m_oldStdOut(0), m_capturing(false)
{
// make stdout unbuffered so that we don't need to flush the stream
setvbuf(stdout, NULL, _IONBF, 0);
setvbuf(stdout, nullptr, _IONBF, 0);
m_pipe[READ] = 0;
m_pipe[WRITE] = 0;
@ -106,7 +106,7 @@ bool StdCapture::EndCapture()
std::string StdCapture::GetChunk()
{
if (!m_capturing) return std::string();
if (!m_capturing) return {};
int bytesRead = 0;
buf[0] = '\0';
@ -120,7 +120,7 @@ std::string StdCapture::GetChunk()
if (bytesRead > 0) {
buf[bytesRead] = '\0';
}
return std::string(buf);
return {buf};
}
std::string StdCapture::GetCapture()

View File

@ -116,7 +116,7 @@ void SearchAndFill(struct FrcFieldItem *item)
/* Read lines until keyword is found */
if (fseek(FrcF,file_pos,SEEK_SET) < 0) {
fprintf(stderr, "Resetting file stream failed: ", strerror(errno));
fprintf(stderr, "Resetting file stream failed: %s\n", strerror(errno));
exit(2);
}
strcpy(line,"empty");

View File

@ -314,7 +314,7 @@ TEST_F(GroupTest, Dynamic)
command("group ramp variable grow"););
}
constexpr double EPSILON = 1.0e-14;
constexpr double EPSILON = 1.0e-13;
TEST_F(GroupTest, VariableFunctions)
{

View File

@ -1,7 +1,7 @@
---
lammps_version: 17 Feb 2022
date_generated: Fri Mar 18 22:17:31 2022
epsilon: 2e-13
epsilon: 5e-13
skip_tests:
prerequisites: ! |
atom full

View File

@ -234,7 +234,7 @@ TEST_F(LAMMPS_configuration, style_count)
{
Info info(lmp);
for (const auto &c : style_category)
EXPECT_EQ(f_lammps_style_count(c.c_str()), info.get_available_styles(c.c_str()).size());
EXPECT_EQ(f_lammps_style_count(c.c_str()), info.get_available_styles(c).size());
};
TEST_F(LAMMPS_configuration, style_name)

View File

@ -129,9 +129,9 @@ TEST(LeptonCustomFunction, zbl)
*/
class ExampleFunction : public Lepton::CustomFunction {
int getNumArguments() const { return 2; }
double evaluate(const double *arguments) const { return 2.0 * arguments[0] * arguments[1]; }
double evaluateDerivative(const double *arguments, const int *derivOrder) const
int getNumArguments() const override { return 2; }
double evaluate(const double *arguments) const override { return 2.0 * arguments[0] * arguments[1]; }
double evaluateDerivative(const double *arguments, const int *derivOrder) const override
{
if (derivOrder[0] == 1) {
if (derivOrder[1] == 0)
@ -142,7 +142,7 @@ class ExampleFunction : public Lepton::CustomFunction {
if (derivOrder[1] == 1 && derivOrder[0] == 0) return 2.0 * arguments[0];
return 0.0;
}
Lepton::CustomFunction *clone() const { return new ExampleFunction(); }
Lepton::CustomFunction *clone() const override { return new ExampleFunction(); }
};
/**