Merge branch 'develop' of https://github.com/lammps/lammps into develop

2024-01-16 23:36:24 +10:00
parent b95706ca64 8076d89422
commit 90b682834f
483 changed files with 11220 additions and 21133 deletions
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@ -35,7 +35,7 @@ jobs:
        python-version: '3.x'

    - name: Initialize CodeQL
-      uses: github/codeql-action/init@v2
+      uses: github/codeql-action/init@v3
      with:
        languages: ${{ matrix.language }}
        config-file: ./.github/codeql/${{ matrix.language }}.yml
@ -55,4 +55,4 @@ jobs:
        cmake --build . --parallel 2

    - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v2
+      uses: github/codeql-action/analyze@v3
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -662,9 +662,14 @@ foreach(PKG_WITH_INCL CORESHELL DPD-SMOOTH MC MISC PHONON QEQ OPENMP KOKKOS OPT
  endif()
 endforeach()

-if(PKG_PLUGIN)
-  target_compile_definitions(lammps PRIVATE -DLMP_PLUGIN)
-endif()
+######################################################################
+# packages with defines to disable package specific code
+######################################################################
+foreach(PKG_WITH_DEF  BPM PLUGIN)
+  if(PKG_${PKG_WITH_DEF})
+    target_compile_definitions(lammps PRIVATE -DLMP_${PKG_WITH_DEF})
+  endif()
+endforeach()

 # link with -ldl or equivalent for plugin loading; except on Windows
 if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
--- a/cmake/Modules/Packages/KOKKOS.cmake
+++ b/cmake/Modules/Packages/KOKKOS.cmake
@ -16,11 +16,6 @@ endif()
 if(Kokkos_ENABLE_OPENMP)
  if(NOT BUILD_OMP)
    message(FATAL_ERROR "Must enable BUILD_OMP with Kokkos_ENABLE_OPENMP")
-  else()
-    # NVHPC/(AMD)Clang does not seem to provide a detectable OpenMP version, but is far beyond version 3.1
-    if((OpenMP_CXX_VERSION VERSION_LESS 3.1) AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "NVHPC") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")))
-      message(FATAL_ERROR "Compiler must support OpenMP 3.1 or later with Kokkos_ENABLE_OPENMP")
-    endif()
  endif()
 endif()
 ########################################################################
--- a/cmake/presets/oneapi.cmake
+++ b/cmake/presets/oneapi.cmake
@ -18,11 +18,11 @@ set(MPI_CXX_COMPILER "mpicxx" CACHE STRING "" FORCE)

 unset(HAVE_OMP_H_INCLUDE CACHE)
 set(OpenMP_C "icx" CACHE STRING "" FORCE)
-set(OpenMP_C_FLAGS "-qopenmp -qopenmp-simd" CACHE STRING "" FORCE)
+set(OpenMP_C_FLAGS "-qopenmp;-qopenmp-simd" CACHE STRING "" FORCE)
 set(OpenMP_C_LIB_NAMES "omp" CACHE STRING "" FORCE)
 set(OpenMP_CXX "icpx" CACHE STRING "" FORCE)
-set(OpenMP_CXX_FLAGS "-qopenmp -qopenmp-simd" CACHE STRING "" FORCE)
+set(OpenMP_CXX_FLAGS "-qopenmp;-qopenmp-simd" CACHE STRING "" FORCE)
 set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE)
-set(OpenMP_Fortran_FLAGS "-qopenmp -qopenmp-simd" CACHE STRING "" FORCE)
+set(OpenMP_Fortran_FLAGS "-qopenmp;-qopenmp-simd" CACHE STRING "" FORCE)
 set(OpenMP_omp_LIBRARY "libiomp5.so" CACHE PATH "" FORCE)

--- a/doc/github-development-workflow.md
+++ b/doc/github-development-workflow.md
@ -36,10 +36,10 @@ requests.
 MUST be submitted as a pull request to GitHub.  All changes to the
 "develop" branch must be made exclusively through merging pull requests.
 The "release" and "stable" branches, respectively, are only to be
-updated upon feature or stable releases based on the associated
-tags.  Updates to the stable release in between stable releases
+updated upon "feature releases" or "stable releases" based on the
+associated tags.  Updates to the stable release in between stable releases
 (for example, back-ported bug fixes) are first merged into the "maintenance"
-branch and then into the "stable" branch as update releases.
+branch and then into the "stable" branch as "stable update releases".

 Pull requests may also be submitted to (long-running) feature branches
 created by LAMMPS developers inside the LAMMPS project, if needed. Those
@ -131,7 +131,7 @@ testing -- that the code in the branch "develop" does not get easily
 broken.  These tests are run after every update to a pull request.  More
 extensive and time-consuming tests (including regression testing) are
 performed after code is merged to the "develop" branch.  There are feature
-releases of LAMMPS made about every 4-6 weeks at a point, when the LAMMPS
+releases of LAMMPS made about every 4-8 weeks at a point, when the LAMMPS
 developers feel, that a sufficient number of changes have been included
 and all post-merge testing has been successful.  These feature releases are
 marked with a `patch_<version date>` tag and the "release" branch
--- a/doc/graphviz/Makefile
+++ b/doc/graphviz/Makefile
@ -16,8 +16,11 @@ clean:
 	rm -f $(IMGSVG) $(IMGPDF) $(IMGPNG) *~

 ifeq ($(HAS_DOT),YES)
-$(IMGDIR)/%.png: %.dot
+$(IMGDIR)/lammps-classes.png : lammps-classes.dot
 	dot -Tpng -Kneato -o $@ $<
+
+$(IMGDIR)/%.png: %.dot
+	dot -Tpng -Kdot -o $@ $<
 endif

 ifeq ($(HAS_DOT),NO)
--- a/doc/graphviz/lammps-releases.dot
+++ b/doc/graphviz/lammps-releases.dot
@ -0,0 +1,34 @@
+// LAMMPS branches and releases
+digraph releases {
+    rankdir="LR";
+    github [shape="box" label="Pull Requests\non GitHub" height=0.75];
+    github -> develop [label="Merge commits"];
+    {
+        rank = "same";
+        work [shape="none" label="Development branches:"]
+        develop [label="'develop' branch" height=0.75];
+        maintenance [label="'maintenance' branch" height=0.75];
+    };
+    {
+        rank = "same";
+        upload [shape="none" label="Release branches:"]
+        release [label="'release' branch" height=0.75];
+        stable [label="'stable' branch" height=0.75];
+    };
+    develop -> release [label="Feature release\n(every 4-8 weeks)"];
+    release -> stable [label="Stable release\n(once per year)"];
+    stable -> maintenance [label="Reset on stable release" style="setlinewidth(2)"];
+    develop -> maintenance [label="Backports of bugfixes" style="dashed"];
+    maintenance -> stable [label="Updates to stable release"];
+    {
+        rank = "same";
+        tag [shape="none" label="Applied tags:"];
+        patchtag [shape="box" label="patch_<date>"];
+        stabletag [shape="box" label="stable_<date>"];
+        updatetag [shape="box" label="stable_<date>_update<num>"];
+    };
+    release -> patchtag [label="feature release" style="dotted"];
+    stable -> stabletag [label="stable release" style="dotted"];
+    stable -> updatetag [label="update release" style="dotted"];
+}
+
--- a/doc/src/Build_development.rst
+++ b/doc/src/Build_development.rst
@ -255,16 +255,18 @@ A test run is then a a collection multiple individual test runs each
 with many comparisons to reference results based on template input
 files, individual command settings, relative error margins, and
 reference data stored in a YAML format file with ``.yaml``
-suffix. Currently the programs ``test_pair_style``, ``test_bond_style``, and
-``test_angle_style`` are implemented.  They will compare forces, energies and
-(global) stress for all atoms after a ``run 0`` calculation and after a
-few steps of MD with :doc:`fix nve <fix_nve>`, each in multiple variants
-with different settings and also for multiple accelerated styles. If a
-prerequisite style or package is missing, the individual tests are
-skipped.  All tests will be executed on a single MPI process, so using
-the CMake option ``-D BUILD_MPI=off`` can significantly speed up testing,
-since this will skip the MPI initialization for each test run.
-Below is an example command and output:
+suffix. Currently the programs ``test_pair_style``, ``test_bond_style``,
+``test_angle_style``, ``test_dihedral_style``, and
+``test_improper_style`` are implemented.  They will compare forces,
+energies and (global) stress for all atoms after a ``run 0`` calculation
+and after a few steps of MD with :doc:`fix nve <fix_nve>`, each in
+multiple variants with different settings and also for multiple
+accelerated styles. If a prerequisite style or package is missing, the
+individual tests are skipped.  All force style tests will be executed on
+a single MPI process, so using the CMake option ``-D BUILD_MPI=off`` can
+significantly speed up testing, since this will skip the MPI
+initialization for each test run.  Below is an example command and
+output:

 .. code-block:: console

@ -416,15 +418,16 @@ When compiling LAMMPS with enabled tests, most test executables will
 need to be linked against the LAMMPS library.  Since this can be a very
 large library with many C++ objects when many packages are enabled, link
 times can become very long on machines that use the GNU BFD linker (e.g.
-Linux systems).  Alternatives like the ``lld`` linker of the LLVM project
-or the ``gold`` linker available with GNU binutils can speed up this step
-substantially. CMake will by default test if any of the two can be
-enabled and use it when ``ENABLE_TESTING`` is active.  It can also be
-selected manually through the ``CMAKE_CUSTOM_LINKER`` CMake variable.
-Allowed values are ``lld``, ``gold``, ``bfd``, or ``default``.  The
-``default`` option will use the system default linker otherwise, the
-linker is chosen explicitly.  This option is only available for the
-GNU or Clang C++ compiler.
+Linux systems).  Alternatives like the ``mold`` linker, the ``lld``
+linker of the LLVM project, or the ``gold`` linker available with GNU
+binutils can speed up this step substantially (in this order).  CMake
+will by default test if any of the three can be enabled and use it when
+``ENABLE_TESTING`` is active.  It can also be selected manually through
+the ``CMAKE_CUSTOM_LINKER`` CMake variable.  Allowed values are
+``mold``, ``lld``, ``gold``, ``bfd``, or ``default``.  The ``default``
+option will use the system default linker otherwise, the linker is
+chosen explicitly.  This option is only available for the GNU or Clang
+C++ compilers.

 Tests for other components and utility functions
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
--- a/doc/src/Commands_compute.rst
+++ b/doc/src/Commands_compute.rst
@ -118,6 +118,7 @@ KOKKOS, o = OPENMP, t = OPT.
   * :doc:`ptm/atom <compute_ptm_atom>`
   * :doc:`rattlers/atom <compute_rattlers_atom>`
   * :doc:`rdf <compute_rdf>`
+   * :doc:`reaxff/atom (k) <compute_reaxff_atom>`
   * :doc:`reduce <compute_reduce>`
   * :doc:`reduce/chunk <compute_reduce_chunk>`
   * :doc:`reduce/region <compute_reduce>`
--- a/doc/src/Developer_platform.rst
+++ b/doc/src/Developer_platform.rst
@ -70,6 +70,9 @@ File and path functions and global constants
 .. doxygenfunction:: is_console
   :project: progguide

+.. doxygenfunction:: disk_free
+   :project: progguide
+
 .. doxygenfunction:: path_is_directory
   :project: progguide

--- a/doc/src/Developer_unittest.rst
+++ b/doc/src/Developer_unittest.rst
@ -121,7 +121,7 @@ will be suppressed and only a summary printed, but adding
 the '-V' option will then produce output from the tests
 above like the following:

-.. code-block::
+.. code-block:: console

   [...]
   1: [ RUN      ] Tokenizer.empty_string
@ -526,3 +526,102 @@ The ``unittest/tools`` folder contains tests for programs in the
 shell, which are implemented as a python scripts using the ``unittest``
 Python module and launching the tool commands through the ``subprocess``
 Python module.
+
+
+Troubleshooting failed unit tests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The are by default no unit tests for newly added features (e.g. pair, fix,
+or compute styles) unless your pull request also includes tests for the
+added features.  If you are modifying some features, you may see failures
+for existing tests, if your modifications have some unexpected side effects
+or your changes render the existing text invalid.  If you are adding an
+accelerated version of an existing style, then only tests for INTEL,
+KOKKOS (with OpenMP only), OPENMP, and OPT will be run automatically.
+Tests for the GPU package are time consuming and thus are only run
+*after* a merge, or when a special label, ``gpu_unit_tests`` is added
+to the pull request.  After the test has started, it is often best to
+remove the label since every PR activity will re-trigger the test (that
+is a limitation of triggering a test with a label).  Support for unit
+tests with using KOKKOS with GPU acceleration is currently not supported.
+
+When you see a failed build on GitHub, click on ``Details`` to be taken
+to the corresponding LAMMPS Jenkins CI web page.  Click on the "Exit"
+symbol near the ``Logout`` button on the top right of that page to go to
+the "classic view".  In the classic view, there is a list of the
+individual runs that make up this test run (they are shown but cannot be
+inspected in the default view).  You can click on any of those.
+Clicking on ``Test Result`` will display the list of failed tests. Click
+on the "Status" column to sort the tests based on their Failed or Passed
+status.  Then click on the failed test to expand its output.
+
+For example, the following output snippet shows the failed unit test
+
+.. code-block:: console
+
+   [ RUN      ] PairStyle.gpu
+   /home/builder/workspace/dev/pull_requests/ubuntu_gpu/unit_tests/cmake_gpu_opencl_mixed_smallbig_clang_static/unittest/force-styles/test_main.cpp:63: Failure
+   Expected: (err) <= (epsilon)
+   Actual: 0.00018957912910606503 vs 0.0001
+   Google Test trace:
+   /home/builder/workspace/dev/pull_requests/ubuntu_gpu/unit_tests/cmake_gpu_opencl_mixed_smallbig_clang_static/unittest/force-styles/test_main.cpp:56: EXPECT_FORCES: init_forces (newton off)
+   /home/builder/workspace/dev/pull_requests/ubuntu_gpu/unit_tests/cmake_gpu_opencl_mixed_smallbig_clang_static/unittest/force-styles/test_main.cpp:64: Failure
+   Expected: (err) <= (epsilon)
+   Actual: 0.00022892713393549854 vs 0.0001
+
+The failed assertions provide line numbers in the test source
+(e.g. ``test_main.cpp:56``), from which one can understand what
+specific assertion failed.
+
+Note that the force style engine runs one of a small number of systems
+in a rather off-equilibrium configuration with a few atoms for a few
+steps, writes data and restart files, uses :doc:`the clear command
+<clear>` to reset LAMMPS, and then runs from those files with different
+settings (e.g. newton on/off) and integrators (e.g. verlet vs. respa).
+Beyond potential issues/bugs in the source code, the mismatch between
+the expected and actual values could be that force arrays are not
+properly cleared between multiple run commands or that class members are
+not correctly initialized or written to or read from a data or restart
+file.
+
+While the epsilon (relative precision) for a single, `IEEE 754 compliant
+<https://en.wikipedia.org/wiki/IEEE_754>`_, double precision floating
+point operation is at about 2.2e-16, the achievable precision for the
+tests is lower due to most numbers being sums over intermediate results
+and the non-associativity of floating point math leading to larger
+errors.  In some cases specific properties of the tested style.  As a
+rule of thumb, the test epsilon can often be in the range 5.0e-14 to
+1.0e-13.  But for "noisy" force kernels, e.g. those a larger amount of
+arithmetic operations involving `exp()`, `log()` or `sin()` functions,
+and also due to the effect of compiler optimization or differences
+between compilers or platforms, epsilon may need to be further relaxed,
+sometimes epsilon can be relaxed to 1.0e-12. If interpolation or lookup
+tables are used, epsilon may need to be set to 1.0e-10 or even higher.
+For tests of accelerated styles, the per-test epsilon is multiplied
+by empirical factors that take into account the differences in the order
+of floating point operations or that some or most intermediate operations
+may be done using approximations or with single precision floating point
+math.
+
+To rerun the failed unit test individually, change to the ``build`` directory
+and run the test with verbose output. For example,
+
+.. code-block:: bash
+
+    env TEST_ARGS=-v ctest -R ^MolPairStyle:lj_cut_coul_long -V
+
+``ctest`` with the ``-V`` flag also shows the exact command line
+of the test. One can then use ``gdb --args`` to further debug and
+catch exceptions with the test command, for example,
+
+.. code-block:: bash
+
+    gdb --args /path/to/lammps/build/test_pair_style /path/to/lammps/unittest/force-styles/tests/mol-pair-lj_cut_coul_long.yaml
+
+
+It is recommended to configure the build with ``-D
+BUILD_SHARED_LIBS=on`` and use a custom linker to shorten the build time
+during recompilation.  Installing `ccache` in your development
+environment helps speed up recompilation by caching previous
+compilations and detecting when the same compilation is being done
+again.  Please see :doc:`Build_development` for further details.
--- a/doc/src/Howto_github.rst
+++ b/doc/src/Howto_github.rst
@ -480,11 +480,11 @@ Some recent changes to the workflow are not captured in this tutorial.
 For example, in addition to the *develop* branch, to which all new
 features should be submitted, there is also a *release*, a *stable*, and
 a *maintenance* branch; the *release* branch is updated from the
-*develop* as part of a feature release, and *stable* (together with
-*release*) are updated from *develop* when a stable release is made. In
-between stable releases, selected bug fixes and infrastructure updates
-are back-ported from the *develop* branch to the *maintenance* branch
-and occasionally merged to *stable* as an update release.
+*develop* branch as part of a "feature release", and *stable* (together
+with *release*) are updated from *develop* when a "stable release" is
+made. In between stable releases, selected bug fixes and infrastructure
+updates are back-ported from the *develop* branch to the *maintenance*
+branch and occasionally merged to *stable* as an update release.

 Furthermore, the naming of the release tags now follow the pattern
 "patch_<Day><Month><Year>" to simplify comparisons between releases.
--- a/doc/src/Install_git.rst
+++ b/doc/src/Install_git.rst
@ -28,16 +28,16 @@ provides `limited support for subversion clients <svn_>`_.

 You can follow the LAMMPS development on 4 different git branches:

-* **release**  :  this branch is updated with every patch or feature release;
-  updates are always "fast-forward" merges from *develop*
-* **develop**  :  this branch follows the ongoing development and
-  is updated with every merge commit of a pull request
-* **stable**   :  this branch is updated from the *release* branch with
-  every stable release version and also has selected bug fixes with every
-  update release when the *maintenance* branch is merged into it
-* **maintenance**  :  this branch collects back-ported bug fixes from the
-  *develop* branch to the *stable* branch. It is used to update *stable*
-  for update releases and it synchronized with *stable* at each stable release.
+* **develop** : this branch follows the ongoing development and is
+  updated with every merge commit of a pull request
+* **release** : this branch is updated with every "feature release";
+   updates are always "fast-forward" merges from *develop*
+* **maintenance** : this branch collects back-ported bug fixes from the
+  *develop* branch to the *stable* branch.  It is used to update the
+  *stable* branch for "stable update releases".
+* **stable** : this branch is updated from the *release* branch with
+  every "stable release" version and also has selected bug fixes with
+  every "update release" when the *maintenance* branch is merged into it

 To access the git repositories on your box, use the clone command to
 create a local copy of the LAMMPS repository with a command like:
--- a/doc/src/JPG/lammps-classes.png
+++ b/doc/src/JPG/lammps-classes.png
--- a/doc/src/JPG/lammps-invoke-python.png
+++ b/doc/src/JPG/lammps-invoke-python.png
--- a/doc/src/JPG/lammps-releases.png
+++ b/doc/src/JPG/lammps-releases.png
--- a/doc/src/JPG/pylammps-invoke-lammps.png
+++ b/doc/src/JPG/pylammps-invoke-lammps.png
--- a/doc/src/JPG/python-invoke-lammps.png
+++ b/doc/src/JPG/python-invoke-lammps.png
--- a/doc/src/Manual_version.rst
+++ b/doc/src/Manual_version.rst
@ -3,45 +3,25 @@ What does a LAMMPS version mean

 The LAMMPS "version" is the date when it was released, such as 1 May
 2014.  LAMMPS is updated continuously, and we aim to keep it working
-correctly and reliably at all times.  You can follow its development
-in a public `git repository on GitHub <https://github.com/lammps/lammps>`_.
-
-Modifications of the LAMMPS source code (like bug fixes, code refactors,
-updates to existing features, or addition of new features) are organized
-into pull requests.  Pull requests will be merged into the *develop*
-branch of the git repository after they pass automated testing and code
-review by the LAMMPS developers.  When a sufficient number of changes
-have accumulated *and* the *develop* branch version passes an extended
-set of automated tests, we release it as a *feature release*, which are
-currently made every 4 to 8 weeks.  The *release* branch of the git
-repository is updated with every such release.  A summary of the most
-important changes of the patch releases are on `this website page
-<https://www.lammps.org/bug.html>`_.  More detailed release notes are
-`available on GitHub <https://github.com/lammps/lammps/releases/>`_.
-
-Once or twice a year, we have a "stabilization period" where we apply
-only bug fixes and small, non-intrusive changes to the *develop*
-branch.  At the same time, the code is subjected to more detailed and
-thorough manual testing than the default automated testing.  Also,
-several variants of static code analysis are run to improve the overall
-code quality, consistency, and compliance with programming standards,
-best practices and style conventions.
-
-The release after such a stabilization period is called a *stable*
-version and both, the *release* and the *stable* branches are updated
-with it.  Between stable releases, we collect back-ported bug fixes and
-updates from the *develop* branch in the *maintenance* branch.  From the
-*maintenance* branch we make occasional update releases and update the
-*stable* branch accordingly.
+correctly and reliably at all times.  Also, several variants of static
+code analysis are run regularly to maintain or improve the overall code
+quality, consistency, and compliance with programming standards, best
+practices and style conventions.  You can follow its development in a
+public `git repository on GitHub <https://github.com/lammps/lammps>`_.

 Each version of LAMMPS contains all the documented *features* up to and
 including its version date.  For recently added features, we add markers
 to the documentation at which specific LAMMPS version a feature or
 keyword was added or significantly changed.

-The version date is printed to the screen and log file every time you run
-LAMMPS.  It is also in the file src/version.h and in the LAMMPS
-directory name created when you unpack a tarball.  And it is on the
+Identifying the Version
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The version date is printed to the screen and log file every time you
+run LAMMPS.  There also is an indication, if a LAMMPS binary was
+compiled from version with modifications **after** a release.
+It is also visible in the file src/version.h and in the LAMMPS directory
+name created when you unpack a downloaded tarball.  And it is on the
 first page of the :doc:`manual <Manual>`.

 * If you browse the HTML pages of the online version of the LAMMPS
@ -53,3 +33,56 @@ first page of the :doc:`manual <Manual>`.
 * If you browse the HTML pages included in your downloaded tarball, they
  describe the version you have, which may be older than the online
  version.
+
+LAMMPS releases, branches, and tags
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. figure:: JPG/lammps-releases.png
+   :figclass: align-center
+
+   Relations between releases, main branches, and tags in the LAMMPS git repository
+
+Development
+"""""""""""
+
+Modifications of the LAMMPS source code (like bug fixes, code
+refactoring, updates to existing features, or addition of new features)
+are organized into pull requests.  Pull requests will be merged into the
+*develop* branch of the git repository after they pass automated testing
+and code review by the LAMMPS developers.
+
+Feature Releases
+""""""""""""""""
+
+When a sufficient number of new features and updates have accumulated
+*and* the LAMMPS version on the *develop* branch passes an extended set
+of automated tests, we release it as a *feature release*, which are
+currently made every 4 to 8 weeks.  The *release* branch of the git
+repository is updated with every such *feature release* and a tag in the
+format ``patch_1May2014`` is added.  A summary of the most important
+changes of these releases for the current year are posted on `this
+website page <https://www.lammps.org/bug.html>`_.  More detailed release
+notes are `available on GitHub
+<https://github.com/lammps/lammps/releases/>`_.
+
+Stable Releases
+"""""""""""""""
+
+About once a year, we release a *stable release* version of LAMMPS.
+This is done after a "stabilization period" where we apply only bug
+fixes and small, non-intrusive changes to the *develop* branch but no
+new features.  At the same time, the code is subjected to more detailed
+and thorough manual testing than the default automated testing.
+After such a *stable release*, both the *release* and the *stable*
+branches are updated and two tags are applied, a ``patch_1May2014`` format
+and a ``stable_1May2014`` format tag.
+
+Stable Release Updates
+""""""""""""""""""""""
+
+Between *stable releases*, we collect bug fixes and updates back-ported
+from the *develop* branch in a branch called *maintenance*.  From the
+*maintenance* branch we make occasional *stable update releases* and
+update the *stable* branch accordingly.  The first update to the
+``stable_1May2014`` release would be tagged as
+``stable_1May2014_update1``.  These updates contain no new features.
--- a/doc/src/bond_bpm_rotational.rst
+++ b/doc/src/bond_bpm_rotational.rst
@ -147,8 +147,8 @@ By default, pair forces are not calculated between bonded particles.
 Pair forces can alternatively be overlaid on top of bond forces by setting
 the *overlay/pair* keyword to *yes*. These settings require specific
 :doc:`special_bonds <special_bonds>` settings described in the
-restrictions.  Further details can be found in the :doc:`how to
-<Howto_bpm>` page on BPMs.
+restrictions.  Further details can be found in the :doc:`how to <Howto_bpm>`
+page on BPMs.

 .. versionadded:: 28Mar2023

--- a/doc/src/bond_bpm_spring.rst
+++ b/doc/src/bond_bpm_spring.rst
@ -113,8 +113,8 @@ By default, pair forces are not calculated between bonded particles.
 Pair forces can alternatively be overlaid on top of bond forces by setting
 the *overlay/pair* keyword to *yes*. These settings require specific
 :doc:`special_bonds <special_bonds>` settings described in the
-restrictions.  Further details can be found in the :doc:`how to
-<Howto_bpm>` page on BPMs.
+restrictions.  Further details can be found in the :doc:`how to <Howto_bpm>`
+page on BPMs.

 .. versionadded:: 28Mar2023

--- a/doc/src/compute.rst
+++ b/doc/src/compute.rst
@ -282,6 +282,7 @@ The individual style names on the :doc:`Commands compute <Commands_compute>` pag
 * :doc:`ptm/atom <compute_ptm_atom>` - determines the local lattice structure based on the Polyhedral Template Matching method
 * :doc:`rattlers/atom <compute_rattlers_atom>` - identify under-coordinated rattler atoms
 * :doc:`rdf <compute_rdf>` - radial distribution function :math:`g(r)` histogram of group of atoms
+* :doc:`reaxff/atom <compute_reaxff_atom>` - extract ReaxFF bond information
 * :doc:`reduce <compute_reduce>` - combine per-atom quantities into a single global value
 * :doc:`reduce/chunk <compute_reduce_chunk>` - reduce per-atom quantities within each chunk
 * :doc:`reduce/region <compute_reduce>` - same as compute reduce, within a region
--- a/doc/src/compute_reaxff_atom.rst
+++ b/doc/src/compute_reaxff_atom.rst
@ -0,0 +1,97 @@
+.. index:: compute reaxff/atom
+.. index:: compute reaxff/atom/kk
+
+compute reaxff/atom command
+===========================
+
+Accelerator Variants: *reaxff/atom/kk*
+
+Syntax
+""""""
+
+.. code-block:: LAMMPS
+
+   compute ID group-ID reaxff/atom attribute args ... keyword value ...
+
+* ID, group-ID are documented in :doc:`compute <compute>` command
+* reaxff/atom = name of this compute command
+* attribute = *pair*
+
+  .. parsed-literal::
+
+       *pair* args = nsub
+         nsub = *n*-instance of a sub-style, if a pair style is used multiple times in a hybrid style
+
+* keyword = *bonds*
+
+  .. parsed-literal::
+
+       *bonds* value = *no* or *yes*
+         *no* = ignore list of local bonds
+         *yes* = include list of local bonds
+
+Examples
+""""""""
+
+.. code-block:: LAMMPS
+
+   compute 1 all reaxff/atom bonds yes
+
+Description
+"""""""""""
+
+.. versionadded:: TBD
+
+Define a computation that extracts bond information computed by the ReaxFF
+potential specified by :doc:`pair_style reaxff <pair_reaxff>`.
+
+By default, it produces per-atom data that includes the following columns:
+
+* abo = atom bond order (sum of all bonds)
+* nlp = number of lone pairs
+* nb = number of bonds
+
+Bonds will only be included if its atoms are in the group.
+
+In addition, if ``bonds`` is set to ``yes``, the compute will also produce a
+local array of all bonds on the current processor whose atoms are in the group.
+The columns of each entry of this local array are:
+
+* id_i = atom i id of bond
+* id_j = atom j id of bond
+* bo = bond order of bond
+
+Output info
+"""""""""""
+
+This compute calculates a per-atom array and local array depending on the
+number of keywords. The number of rows in the local array is the number of
+bonds as described above. Both per-atom and local array have 3 columns.
+
+The arrays can be accessed by any command that uses local and per-atom values
+from a compute as input.  See the :doc:`Howto output <Howto_output>` page for
+an overview of LAMMPS output options.
+
+----------
+
+.. include:: accel_styles.rst
+
+----------
+
+Restrictions
+""""""""""""
+
+The compute reaxff/atom command requires that the :doc:`pair_style reaxff
+<pair_reaxff>` is invoked.  This fix is part of the REAXFF package.  It is only
+enabled if LAMMPS was built with that package.  See the :doc:`Build package
+<Build_package>` page for more info.
+
+Related commands
+""""""""""""""""
+
+:doc:`pair_style reaxff <pair_reaxff>`
+
+Default
+"""""""
+
+The option defaults are *bonds* = *no*.
--- a/doc/src/compute_stress_mop.rst
+++ b/doc/src/compute_stress_mop.rst
@ -18,7 +18,7 @@ Syntax
 * style = *stress/mop* or *stress/mop/profile*
 * dir = *x* or *y* or *z* is the direction normal to the plane
 * args = argument specific to the compute style
-* keywords = *kin* or *conf* or *total* or *pair* or *bond* or *angle* (one or more can be specified)
+* keywords = *kin* or *conf* or *total* or *pair* or *bond* or *angle* or *dihedral* (one or more can be specified)

 .. parsed-literal::

@ -68,15 +68,13 @@ Verlet algorithm.

 .. versionadded:: 15Jun2023

-   contributions from bond and angle potentials
+   contributions from bond, angle and dihedral potentials

-Between one and six keywords can be used to indicate which contributions
+Between one and seven keywords can be used to indicate which contributions
 to the stress must be computed: total stress (total), kinetic stress
 (kin), configurational stress (conf), stress due to bond stretching
-(bond), stress due to angle bending (angle) and/or due to pairwise
-non-bonded interactions (pair).  The angle keyword is currently
-available only for the *stress/mop* command and **not** the
-*stress/mop/profile* command.
+(bond), stress due to angle bending (angle), stress due to dihedral terms (dihedral)
+and/or due to pairwise non-bonded interactions (pair).

 NOTE 1: The configurational stress is computed considering all pairs of
 atoms where at least one atom belongs to group group-ID.
@ -134,14 +132,9 @@ size does not change in time, and axis-aligned planes.
 The method only works with two-body pair interactions, because it
 requires the class method ``Pair::single()`` to be implemented, which is
 not possible for manybody potentials.  In particular, compute
-*stress/mop/profile* does not work with more than two-body pair
+*stress/mop/profile* and *stress/mop* do not work with more than two-body pair
 interactions, long range (kspace) interactions and
-angle/dihedral/improper intramolecular interactions. Similarly, compute
-*stress/mop* does not work with more than two-body pair interactions,
-long range (kspace) interactions and dihedral/improper intramolecular
-interactions but works with all bond interactions with the class method
-single() implemented and all angle interactions with the class method
-born_matrix() implemented.
+improper intramolecular interactions.

 Related commands
 """"""""""""""""
--- a/doc/src/fix_halt.rst
+++ b/doc/src/fix_halt.rst
@ -183,4 +183,4 @@ Related commands
 Default
 """""""

-The option defaults are error = hard, message = yes, and path = ".".
+The option defaults are error = soft, message = yes, and path = ".".
--- a/doc/src/package.rst
+++ b/doc/src/package.rst
@ -344,12 +344,10 @@ specify additional flags for the runtime build.

 ----------

-The *intel* style invokes settings associated with the use of the
-INTEL package.  All of its settings, except the *omp* and *mode*
-keywords, are ignored if LAMMPS was not built with Xeon Phi
-co-processor support.  All of its settings, including the *omp* and
-*mode* keyword are applicable if LAMMPS was built with co-processor
-support.
+The *intel* style invokes settings associated with the use of the INTEL
+package.  The keywords *balance*, *ghost*, *tpc*, and *tptask* are
+**only** applicable if LAMMPS was built with Xeon Phi co-processor
+support and are otherwise ignored.

 The *Nphi* argument sets the number of co-processors per node.
 This can be set to any value, including 0, if LAMMPS was not
@ -474,13 +472,13 @@ If the *neigh/thread* keyword is set to *off*, then the KOKKOS package
 threads only over atoms. However, for small systems, this may not expose
 enough parallelism to keep a GPU busy. When this keyword is set to *on*,
 the KOKKOS package threads over both atoms and neighbors of atoms. When
-using *neigh/thread* *on*, a full neighbor list must also be used. Using
-*neigh/thread* *on* may be slower for large systems, so this this option
-is turned on by default only when there are 16K atoms or less owned by
-an MPI rank and when using a full neighbor list. Not all KOKKOS-enabled
-potentials support this keyword yet, and only thread over atoms. Many
-simple pairwise potentials such as Lennard-Jones do support threading
-over both atoms and neighbors.
+using *neigh/thread* *on*, the :doc:`newton pair <newton>` setting must
+be "off". Using *neigh/thread* *on* may be slower for large systems, so
+this this option is turned on by default only when running on one or
+more GPUs and there are 16k atoms or less owned by an MPI rank. Not all
+KOKKOS-enabled potentials support this keyword yet, and only thread over
+atoms. Many simple pairwise potentials such as Lennard-Jones do support
+threading over both atoms and neighbors.

 If the *neigh/transpose* keyword is set to *off*, then the KOKKOS
 package will use the same memory layout for building the neighbor list on
@ -732,7 +730,7 @@ comm = device, sort = device, neigh/transpose = off, gpu/aware = on. When
 LAMMPS can safely detect that GPU-aware MPI is not available, the default value
 of gpu/aware becomes "off". For CPUs or Xeon Phis, the option defaults are
 neigh = half, neigh/qeq = half, newton = on, binsize = 0.0, comm = no, and sort
-= no.  The option neigh/thread = on when there are 16K atoms or less on an MPI
+= no. For GPUs, option neigh/thread = on when there are 16k atoms or less on an MPI
 rank, otherwise it is "off". These settings are made automatically by the
 required "-k on" :doc:`command-line switch <Run_options>`. You can change them
 by using the package kokkos command in your input script or via the :doc:`-pk
--- a/doc/src/pair_aip_water_2dm.rst
+++ b/doc/src/pair_aip_water_2dm.rst
@ -22,13 +22,24 @@ Examples
 .. code-block:: LAMMPS

   pair_style  hybrid/overlay aip/water/2dm 16.0 1
-   pair_coeff  * * aip/water/2dm  COH.aip.water.2dm C Ow Hw
+   pair_coeff  * * aip/water/2dm  CBNOH.aip.water.2dm C Ow Hw

   pair_style  hybrid/overlay aip/water/2dm 16.0 lj/cut/tip4p/long 2 3 1 1 0.1546 10 8.5
-   pair_coeff  2 2   lj/cut/tip4p/long    8.0313e-3  3.1589  # O-O
-   pair_coeff  2 3   lj/cut/tip4p/long    0.0        0.0     # O-H
-   pair_coeff  3 3   lj/cut/tip4p/long    0.0        0.0     # H-H
-   pair_coeff  * *   aip/water/2dm        COH.aip.water.2dm    C Ow Hw
+   pair_coeff  2 2   lj/cut/tip4p/long    8.0313e-3  3.1589      # O-O
+   pair_coeff  2 3   lj/cut/tip4p/long    0.0        0.0         # O-H
+   pair_coeff  3 3   lj/cut/tip4p/long    0.0        0.0         # H-H
+   pair_coeff  * *   aip/water/2dm        CBNOH.aip.water.2dm    C Ow Hw
+
+   pair_style  hybrid/overlay aip/water/2dm 16.0 lj/cut/tip4p/long 3 4 1 1 0.1546 10 8.5 coul/shield 16.0 1
+   pair_coeff  1*2 1*2   none
+   pair_coeff  3 3   lj/cut/tip4p/long    8.0313e-3  3.1589      # O-O
+   pair_coeff  3 4   lj/cut/tip4p/long    0.0        0.0         # O-H
+   pair_coeff  4 4   lj/cut/tip4p/long    0.0        0.0         # H-H
+   pair_coeff  * *   aip/water/2dm        CBNOH.aip.water.2dm  B N Ow Hw
+   pair_coeff  1 3   coul/shield          1.333
+   pair_coeff  1 4   coul/shield          1.333
+   pair_coeff  2 3   coul/shield          1.333
+   pair_coeff  2 4   coul/shield          1.333

 Description
 """""""""""
@ -37,7 +48,7 @@ Description

 The *aip/water/2dm* style computes the anisotropic interfacial potential
 (AIP) potential for interfaces of water with two-dimensional (2D)
-materials as described in :ref:`(Feng) <Feng>`.
+materials as described in :ref:`(Feng1) <Feng1>` and :ref:`(Feng2) <Feng2>`.

 .. math::

@ -62,12 +73,12 @@ larger than :math:`r_c` :doc:`pair_style ilp_graphene_hbn
 .. note::

   This pair style uses the atomic normal vector definition from
-   :ref:`(Feng) <Feng>`), where the atomic normal vectors of the
+   :ref:`(Feng1) <Feng1>`), where the atomic normal vectors of the
   hydrogen atoms are assumed to lie along the corresponding
   oxygen-hydrogen bonds and the normal vector of the central oxygen
   atom is defined as their average.

-The provided parameter file, ``COH.aip.water.2dm``, is intended for use
+The provided parameter file, ``CBNOH.aip.water.2dm``, is intended for use
 with *metal* :doc:`units <units>`, with energies in meV.  Two additional
 parameters, *S*, and *rcut* are included in the parameter file. *S* is
 designed to facilitate scaling of energies; *rcut* is the cutoff for an
@ -77,7 +88,7 @@ the calculation of the normals for all atom pairs.
 .. note::

   The parameters presented in the provided parameter file,
-   ``COH.aip.water.2dm``, are fitted with the taper function enabled by
+   ``CBNOH.aip.water.2dm``, are fitted with the taper function enabled by
   setting the cutoff equal to 16.0 Angstrom.  Using a different cutoff
   or taper function setting should be carefully checked as they can
   lead to significant errors.  These parameters provide a good
@ -134,7 +145,7 @@ if LAMMPS was built with that package.  See the :doc:`Build package
 This pair style requires the newton setting to be *on* for pair
 interactions.

-The ``COH.aip.water.2dm`` potential file provided with LAMMPS is
+The ``CBNOH.aip.water.2dm`` potential file provided with LAMMPS is
 parameterized for *metal* units.  You can use this pair style with any
 LAMMPS units, but you would need to create your own potential file with
 parameters in the appropriate units, if your simulation does not use
@ -162,6 +173,10 @@ tap_flag = 1

 ----------

-.. _Feng:
+.. _Feng1:

-**(Feng)** Z. Feng and W. Ouyang et al., J. Phys. Chem. C. 127, 8704-8713 (2023).
+**(Feng1)** Z. Feng, ..., and W. Ouyang, J. Phys. Chem. C. 127(18), 8704-8713 (2023).
+
+.. _Feng2:
+
+**(Feng2)** Z. Feng, ..., and W. Ouyang, Langmuir 39(50), 18198-18207 (2023).
--- a/doc/src/pair_ilp_tmd.rst
+++ b/doc/src/pair_ilp_tmd.rst
@ -22,12 +22,12 @@ Examples
 .. code-block:: LAMMPS

   pair_style  hybrid/overlay ilp/tmd 16.0 1
-   pair_coeff  * * ilp/tmd  MoS2.ILP Mo S S
+   pair_coeff  * * ilp/tmd  TMD.ILP Mo S S

   pair_style  hybrid/overlay sw/mod sw/mod ilp/tmd 16.0
   pair_coeff  * * sw/mod 1  tmd.sw.mod Mo S S NULL NULL NULL
-   pair_coeff  * * sw/mod 2  tmd.sw.mod NULL NULL NULL Mo S S
-   pair_coeff  * * ilp/tmd   MoS2.ILP   Mo S S Mo S S
+   pair_coeff  * * sw/mod 2  tmd.sw.mod NULL NULL NULL W Se Se
+   pair_coeff  * * ilp/tmd   TMD.ILP   Mo S S W Se Se

 Description
 """""""""""
@ -36,7 +36,7 @@ Description

 The *ilp/tmd* style computes the registry-dependent interlayer
 potential (ILP) potential for transition metal dichalcogenides (TMD)
-as described in :ref:`(Ouyang7) <Ouyang7>`.
+as described in :ref:`(Ouyang7) <Ouyang7>` and :ref:`(Jiang) <Jiang>`.

 .. math::

@ -69,7 +69,7 @@ calculating the normals.
   each atom `i`, its six nearest neighboring atoms belonging to the same
   sub-layer are chosen to define the normal vector `{\bf n}_i`.

-The parameter file (e.g. MoS2.ILP), is intended for use with *metal*
+The parameter file (e.g. TMD.ILP), is intended for use with *metal*
 :doc:`units <units>`, with energies in meV. Two additional parameters,
 *S*, and *rcut* are included in the parameter file. *S* is designed to
 facilitate scaling of energies. *rcut* is designed to build the neighbor
@ -77,7 +77,7 @@ list for calculating the normals for each atom pair.

 .. note::

-   The parameters presented in the parameter file (e.g. MoS2.ILP),
+   The parameters presented in the parameter file (e.g. TMD.ILP),
   are fitted with taper function by setting the cutoff equal to 16.0
   Angstrom.  Using different cutoff or taper function should be careful.
   These parameters provide a good description in both short- and long-range
@ -133,10 +133,10 @@ if LAMMPS was built with that package.  See the :doc:`Build package
 This pair style requires the newton setting to be *on* for pair
 interactions.

-The MoS2.ILP potential file provided with LAMMPS (see the potentials
+The TMD.ILP potential file provided with LAMMPS (see the potentials
 directory) are parameterized for *metal* units.  You can use this
 potential with any LAMMPS units, but you would need to create your own
-custom MoS2.ILP potential file with coefficients listed in the appropriate
+custom TMD.ILP potential file with coefficients listed in the appropriate
 units, if your simulation does not use *metal* units.

 Related commands
@ -164,3 +164,7 @@ tap_flag = 1
 .. _Ouyang7:

 **(Ouyang7)** W. Ouyang, et al., J. Chem. Theory Comput. 17, 7237 (2021).
+
+.. _Jiang:
+
+**(Jiang)** W. Jiang, et al., J. Phys. Chem. A, 127, 46, 9820-9830 (2023).
--- a/doc/src/pair_reaxff.rst
+++ b/doc/src/pair_reaxff.rst
@ -373,7 +373,8 @@ Related commands

 :doc:`pair_coeff <pair_coeff>`, :doc:`fix qeq/reaxff <fix_qeq_reaxff>`,
 :doc:`fix acks2/reaxff <fix_acks2_reaxff>`, :doc:`fix reaxff/bonds <fix_reaxff_bonds>`,
-:doc:`fix reaxff/species <fix_reaxff_species>`
+:doc:`fix reaxff/species <fix_reaxff_species>`,
+:doc:`compute reaxff/atom <compute_reaxff_atom>`

 Default
 """""""
--- a/doc/src/run_style.rst
+++ b/doc/src/run_style.rst
@ -329,7 +329,8 @@ Restrictions
 The *verlet/split* style can only be used if LAMMPS was built with the
 REPLICA package. Correspondingly the *respa/omp* style is available
 only if the OPENMP package was included. See the :doc:`Build package
-<Build_package>` page for more info.
+<Build_package>` page for more info.  It is not compatible with
+kspace styles from the INTEL package.

 Whenever using rRESPA, the user should experiment with trade-offs in
 speed and accuracy for their system, and verify that they are
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@ -792,6 +792,7 @@ dispersionflag
 dissipative
 Dissipative
 distharm
+distutils
 dl
 dlabel
 dlambda
--- a/examples/COUPLE/simple/simple.cpp
+++ b/examples/COUPLE/simple/simple.cpp
@ -67,7 +67,7 @@ int main(int narg, char **arg)
  FILE *fp;
  if (me == 0) {
    fp = fopen(arg[2],"r");
-    if (fp == NULL) {
+    if (fp == nullptr) {
      printf("ERROR: Could not open LAMMPS input script\n");
      MPI_Abort(MPI_COMM_WORLD,1);
    }
@ -78,14 +78,14 @@ int main(int narg, char **arg)
  // (could just send it to proc 0 of comm_lammps and let it Bcast)
  // all LAMMPS procs call input->one() on the line
  
-  LAMMPS *lmp = NULL;
-  if (lammps == 1) lmp = new LAMMPS(0,NULL,comm_lammps);
+  LAMMPS *lmp = nullptr;
+  if (lammps == 1) lmp = new LAMMPS(0,nullptr,comm_lammps);

  int n;
  char line[1024];
-  while (1) {
+  while (true) {
    if (me == 0) {
-      if (fgets(line,1024,fp) == NULL) n = 0;
+      if (fgets(line,1024,fp) == nullptr) n = 0;
      else n = strlen(line) + 1;
      if (n == 0) fclose(fp);
    }
@ -101,8 +101,8 @@ int main(int narg, char **arg)
  // put coords back into LAMMPS
  // run a single step with changed coords

-  double *x = NULL;
-  double *v = NULL;
+  double *x = nullptr;
+  double *v = nullptr;

  if (lammps == 1) {
    lmp->input->one("run 10");
@ -147,7 +147,7 @@ int main(int narg, char **arg)
  // create_atoms() to create new ones with old coords, vels
  // initial thermo should be same as step 20

-  int *type = NULL;
+  int *type = nullptr;

  if (lammps == 1) {
    int natoms = static_cast<int> (lmp->atom->natoms);
@ -155,7 +155,7 @@ int main(int narg, char **arg)
    for (int i = 0; i < natoms; i++) type[i] = 1;

    lmp->input->one("delete_atoms group all");
-    lammps_create_atoms(lmp,natoms,NULL,type,x,v,NULL,0);
+    lammps_create_atoms(lmp,natoms,nullptr,type,x,v,nullptr,0);
    lmp->input->one("run 10");
  }

--- a/examples/PACKAGES/interlayer/aip_water_2dm/CBNOH.aip.water.2dm
+++ b/examples/PACKAGES/interlayer/aip_water_2dm/CBNOH.aip.water.2dm
@ -0,0 +1 @@
+../../../../potentials/CBNOH.aip.water.2dm
--- a/examples/PACKAGES/interlayer/aip_water_2dm/COH.aip.water.2dm
+++ b/examples/PACKAGES/interlayer/aip_water_2dm/COH.aip.water.2dm
@ -1 +0,0 @@
-../../../../potentials/COH.aip.water.2dm
--- a/examples/PACKAGES/interlayer/ilp_tmds/MoS2.ILP
+++ b/examples/PACKAGES/interlayer/ilp_tmds/MoS2.ILP
@ -1 +0,0 @@
-../../../../potentials/MoS2.ILP
--- a/examples/PACKAGES/interlayer/ilp_tmds/TMD.ILP
+++ b/examples/PACKAGES/interlayer/ilp_tmds/TMD.ILP
@ -0,0 +1 @@
+../../../../potentials/TMD.ILP
--- a/examples/PACKAGES/interlayer/ilp_tmds/in.mos2
+++ b/examples/PACKAGES/interlayer/ilp_tmds/in.mos2
@ -12,7 +12,7 @@ mass            4 95.94
 pair_style     	hybrid/overlay sw/mod sw/mod ilp/tmd 16.0
 pair_coeff     	* * sw/mod 1 tmd.sw.mod Mo S S NULL NULL NULL
 pair_coeff     	* * sw/mod 2 tmd.sw.mod NULL NULL NULL Mo S S
-pair_coeff     	* * ilp/tmd  MoS2.ILP Mo S S Mo S S
+pair_coeff     	* * ilp/tmd  TMD.ILP Mo S S Mo S S

 # Calculate the pair potential
 compute   	0 all pair ilp/tmd
--- a/examples/reaxff/in.reaxff.tatb
+++ b/examples/reaxff/in.reaxff.tatb
@ -31,8 +31,15 @@ neigh_modify    delay 0 every 5 check no
 fix             1 all nve
 fix             2 all qeq/reaxff 1 0.0 10.0 1.0e-6 reaxff
 fix             4 all reaxff/bonds 5 bonds.reaxff
+compute         bonds all reaxff/atom bonds yes
 variable nqeq   equal f_2

+# dumps out the local bond information
+dump 1 all local 5 bonds_local.reaxff c_bonds[1] c_bonds[2] c_bonds[3]
+
+# dumps out the peratom bond information
+dump 2 all custom 5 bonds_atom.reaxff id type q c_bonds[*]
+
 thermo          5
 thermo_style    custom step temp epair etotal press &
                v_eb v_ea v_elp v_emol v_ev v_epen v_ecoa &
--- a/lib/kokkos/core/src/Kokkos_Printf.hpp
+++ b/lib/kokkos/core/src/Kokkos_Printf.hpp
@ -31,7 +31,7 @@ namespace Kokkos {
 // backends. The GPU backends always return 1 and NVHPC only compiles if we
 // don't ask for the return value.
 template <typename... Args>
-KOKKOS_FUNCTION void printf(const char* format, Args... args) {
+KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) {
 #ifdef KOKKOS_ENABLE_SYCL
  // Some compilers warn if "args" is empty and format is not a string literal
  if constexpr (sizeof...(Args) == 0)
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
@ -219,8 +219,6 @@ KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions,
    Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions,
                                  partition_size);

-    OpenMP::memory_space space;
-
 #pragma omp parallel num_threads(num_partitions)
    {
      Exec thread_local_instance(partition_size);
--- a/potentials/CBNOH.aip.water.2dm
+++ b/potentials/CBNOH.aip.water.2dm
@ -0,0 +1,58 @@
+# DATE: 2023-12-20 UNITS: metal CONTRIBUTOR: Wengen Ouyang w.g.ouyang@gmail.com
+# CITATION: Z. Feng, ..., and W. Ouyang, J. Phys. Chem. C 127(18), 8704 (2023).
+# CITATION: Z. Feng, ..., and W. Ouyang, Langmuir 39(50), 18198-18207 (2023).
+# Anisotropic Potential (AIP) for water/graphene and water/hBN heterojunctions
+# The parameters below are fitted against the PBE + MBD-NL (graphene/water) and SCAN (hBN/water) DFT reference data.
+
+# The parameters for bilayer graphene/graphene, graphene/hBN and hBN/hBN junctions are taken from
+# CITATION: Ouyang, Mandelli, Urbakh, Hod, Nano Letters 18, 6009-6016 (2018).
+#
+# -------------------Repulsion Potential ------------------++++++++++++++ Vdw Potential ++++++++++++++++************
+#    beta(A)    alpha    delta(A)  epsilon(meV)  C(meV)        d         sR      reff(A)   C6(meV*A^6)    S    rcut
+#
+# For water-graphene
+C Ow  5.453696  6.181724  1.250255   3.349092     0.687806  9.057065   1.232495   2.775772  100226.555031  1.0  2.0
+C Hw  2.553809  9.686644  1.964892  41.776171   -16.300128  9.015685   0.744155   2.415456    7409.128564  1.0  2.0
+Ow C  5.453696  6.181724  1.250255   3.349092     0.687806  9.057065   1.232495   2.775772  100226.555031  1.0  1.2
+Hw C  2.553809  9.686644  1.964892  41.776171   -16.300128  9.015685   0.744155   2.415456    7409.128564  1.0  1.2
+
+# For water-hBN
+N Ow  3.530598  16.377816  1.285374  1.717537    1.339337  24.797794   0.771411   3.928357   33589.850651  1.0  2.0
+N Hw  4.029390   5.360546  0.950352 15.945549   -1.486701  10.797276   1.352684   2.293775   41247.181447  1.0  2.0
+B Ow  3.907514   7.842519  2.380078 32.122737    1.190485  17.482482   0.788174   2.368217  139539.370785  1.0  2.0
+B Hw  3.804966   2.356248  1.114761  9.193309   -5.922514   9.000572   1.334703   1.746122   43796.489158  1.0  2.0
+Ow N  3.530598  16.377816  1.285374  1.717537    1.339337  24.797794   0.771411   3.928357   33589.850651  1.0  1.2
+Hw N  4.029390   5.360546  0.950352 15.945549   -1.486701  10.797276   1.352684   2.293775   41247.181447  1.0  1.2
+Ow B  3.907514   7.842519  2.380078 32.122737    1.190485  17.482482   0.788174   2.368217  139539.370785  1.0  1.2
+Hw B  3.804966   2.356248  1.114761  9.193309   -5.922514   9.000572   1.334703   1.746122   43796.489158  1.0  1.2
+
+# For graphene and hydrocarbons
+C C   3.205843  7.511126  1.235334  1.528338E-5  37.530428  15.499947  0.7954443  3.681440  25.714535E3   1.0   2.0
+H H   3.974540   6.53799  1.080633  0.6700556    0.8333833  15.022371  0.7490632  2.767223  1.6159581E3   1.0   1.2
+C H   2.642950  12.91410  1.020257  0.9750012    25.340996  15.222927  0.8115998  3.887324  5.6874617E3   1.0   1.5
+H C   2.642950  12.91410  1.020257  0.9750012    25.340996  15.222927  0.8115998  3.887324  5.6874617E3   1.0   1.5
+
+# For hBN
+B B   3.143737  9.825139  1.936405   2.7848400   14.495957   15.199263  0.7834022  3.682950  49.498013E3   1.0  2.0
+N N   3.443196  7.084490  1.747349   2.9139991   46.508553   15.020370  0.8008370  3.551843  14.810151E3   1.0  2.0
+B N   3.295257  7.224311  2.872667   1.3715032    0.4347152  14.594578  0.8044028  3.765728  24.669996E3   1.0  2.0
+B H   2.718657  9.214551  3.273063  14.015714    14.760509   15.084752  0.7768383  3.640866  7.9642467E3   1.0  1.5
+N B   3.295257  7.224311  2.872667   1.3715032    0.4347152  14.594578  0.8044028  3.765728  24.669996E3   1.0  2.0
+H B   2.718657  9.214551  3.273063  14.015714    14.760509   15.084752  0.7768383  3.640866  7.9642467E3   1.0  1.5
+
+# For graphene-hBN
+C B  3.303662  10.54415   2.926741  16.719972     0.3571734  15.305254  0.7001581  3.097327  30.162869E3   1.0  2.0
+C N  3.253564   8.825921  1.059550  18.344740    21.913573   15.000000  0.7234983  3.013117  19.063095E3   1.0  2.0
+B C  3.303662  10.54415   2.926741  16.719972     0.3571734  15.305254  0.7001581  3.097327  30.162869E3   1.0  2.0
+N C  3.253564   8.825921  1.059550  18.344740    21.913573   15.000000  0.7234983  3.013117  19.063095E3   1.0  2.0
+
+# The AIPs for other elements are turned off
+H Ow  5.453696   6.181724  1.250255  0.000000   0.000000    9.057065   1.232495   2.775772      0.000000   1.0  1.2
+H Hw  5.453696   6.181724  1.250255  0.000000   0.000000    9.057065   1.232495   2.775772      0.000000   1.0  1.2
+Ow H  5.453696   6.181724  1.250255  0.000000   0.000000    9.057065   1.232495   2.775772      0.000000   1.0  1.2
+Hw H  5.453696   6.181724  1.250255  0.000000   0.000000    9.057065   1.232495   2.775772      0.000000   1.0  1.2
+
+Ow Ow 5.453696   6.181724  1.250255  0.000000   0.000000    9.057065   1.232495   2.775772      0.000000   1.0  1.2
+Hw Hw 5.453696   6.181724  1.250255  0.000000   0.000000    9.057065   1.232495   2.775772      0.000000   1.0  1.2
+Ow Hw 5.453696   6.181724  1.250255  0.000000   0.000000    9.057065   1.232495   2.775772      0.000000   1.0  1.2
+Hw Ow 5.453696   6.181724  1.250255  0.000000   0.000000    9.057065   1.232495   2.775772      0.000000   1.0  1.2
--- a/potentials/COH.aip.water.2dm
+++ b/potentials/COH.aip.water.2dm
@ -1,28 +0,0 @@
-# DATE: 2022-12-02 UNITS: metal CONTRIBUTOR: Wengen Ouyang w.g.ouyang@gmail.com CITATION: Z. Feng, ..., and W. Ouyang, J. Phys. Chem. C 127, 8704 (2023).
-# Anisotropic Interfacial Potential (AIP) parameters for water/graphene heterojunctions
-# The parameters below are fitted against the PBE + MBD-NL DFT reference data from 2.5 A to 15 A.
-#
-# ----------------- Repulsion Potential ------------------++++++++++++++ Vdw Potential ++++++++++++++++************
-#    beta(A)    alpha    delta(A)  epsilon(meV)  C(meV)        d         sR      reff(A)   C6(meV*A^6)    S    rcut
-# For graphene and hydrocarbons
-C C  3.205843  7.511126  1.235334  1.528338E-5  37.530428  15.499947  0.7954443  3.681440  25.714535E3   1.0   2.0
-H H  3.974540   6.53799  1.080633  0.6700556    0.8333833  15.022371  0.7490632  2.767223  1.6159581E3   1.0   1.2
-C H  2.642950  12.91410  1.020257  0.9750012    25.340996  15.222927  0.8115998  3.887324  5.6874617E3   1.0   1.5
-H C  2.642950  12.91410  1.020257  0.9750012    25.340996  15.222927  0.8115998  3.887324  5.6874617E3   1.0   1.5
-
-# For water-graphene
-C Ow  5.45369612 6.18172364 1.25025450  3.34909245   0.68780636 9.05706482 1.23249498 2.77577173 100226.55503127   1.0   2.0
-C Hw  2.55380862 9.68664390 1.96489198 41.77617053 -16.30012807 9.01568534 0.74415463 2.41545571   7409.12856378   1.0   2.0
-Ow C  5.45369612 6.18172364 1.25025450  3.34909245   0.68780636 9.05706482 1.23249498 2.77577173 100226.55503127   1.0   1.2
-Hw C  2.55380862 9.68664390 1.96489198 41.77617053 -16.30012807 9.01568534 0.74415463 2.41545571   7409.12856378   1.0   1.2
-
-# # The ILPs for other systems are set to zero
-H Ow  5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000    1.0     1.2
-H Hw  5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000    1.0     1.2
-Ow H  5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000    1.0     1.2
-Hw H  5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000    1.0     1.2
-
-Ow Ow 5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000    1.0     1.2
-Hw Hw 5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000    1.0     1.2
-Ow Hw 5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000    1.0     1.2
-Hw Ow 5.45369612 6.18172364 1.25025450 0.00000000 0.00000000 9.05706482 1.23249498 2.77577173 0.00000000    1.0     1.2
--- a/potentials/TMD.ILP
+++ b/potentials/TMD.ILP
@ -0,0 +1,25 @@
+# DATE: 2021-12-02 UNITS: metal CONTRIBUTOR: Wengen Ouyang w.g.ouyang@gmail.com
+# CITATION: W. Ouyang, et al., J. Chem. Theory Comput. 17, 7237 (2021).
+# CITATION: W.  Jiang, et al., J. Phys. Chem. A, 127, 46, 9820–9830 (2023).
+# Interlayer Potential (ILP) for bilayer and bulk Group-VI Transition Metal Dichalcogenides.
+# The parameters below are fitted against the HSE + MBD-NL DFT reference data.
+#
+# -------------------- Repulsion Potential -------------------++++++++++++++++ Vdw Potential ++++++++++++++++*********
+#       beta(A)     alpha    delta(A)  epsilon(meV)  C(meV)         d         sR     reff(A)    C6(meV*A^6)    S   rcut
+Mo Mo  5.579450   9.377662   2.027222  144.151775   97.978570   89.437597  2.059031  5.122055  491850.316195  1.0  4.0
+W   W  5.530854   6.624992   1.983208    0.271792  140.174059  107.392585  1.356333  4.437591  691850.243962  1.0  4.0
+S   S  3.161402   8.093263   1.953140    4.586764  118.065466   58.809416  0.215367  4.299600  148811.243409  1.0  4.0
+Se Se  3.938627  10.515924   2.415783    3.012583   22.400612  116.864517  0.151121  5.884241  112506.195626  1.0  4.0
+Mo  W  5.412298   8.647128   2.108665   51.177950  184.342860  201.281256  2.547743  2.492287   99996.913401  1.0  4.0
+Mo  S  3.627152  19.971375   7.585031   76.101931    3.317496   45.720328  0.947470  4.410425  150597.857716  1.0  4.0
+Mo Se  6.196447   4.844134  14.362005    7.407221    0.058823   27.156223  0.976771  3.979186  786029.840651  1.0  4.0
+W   S  3.680136  11.163004  32.254117  110.019679   79.381335  138.340438  0.900750  8.875776  250600.809034  1.0  4.0
+W  Se  3.559392  20.638856   1.202717   20.478669  197.422484   10.005271  1.052738  3.815817  288321.561114  1.0  4.0
+S  Se  2.820092   7.491151   1.933323  141.532559  293.127817   90.470904  0.390492  4.170885  117688.987069  1.0  4.0
+# Symmetric Atom Pair
+W  Mo  5.412298   8.647128   2.108665   51.177950  184.342860  201.281256  2.547743  2.492287   99996.913401  1.0  4.0
+S  Mo  3.627152  19.971375   7.585031   76.101931    3.317496   45.720328  0.947470  4.410425  150597.857716  1.0  4.0
+Se Mo  6.196447   4.844134  14.362005    7.407221    0.058823   27.156223  0.976771  3.979186  786029.840651  1.0  4.0
+S   W  3.680136  11.163004  32.254117  110.019679   79.381335  138.340438  0.900750  8.875776  250600.809034  1.0  4.0
+Se  W  3.559392  20.638856   1.202717   20.478669  197.422484   10.005271  1.052738  3.815817  288321.561114  1.0  4.0
+Se  S  2.820092   7.491151   1.933323  141.532559  293.127817   90.470904  0.390492  4.170885  117688.987069  1.0  4.0
--- a/src/.gitignore
+++ b/src/.gitignore
@ -346,8 +346,12 @@
 /bond_bpm_spring.h
 /compute_nbond_atom.cpp
 /compute_nbond_atom.h
+/fix_bond_history.cpp
+/fix_bond_history.h
 /fix_nve_bpm_sphere.cpp
 /fix_nve_bpm_sphere.h
+/fix_update_special_bonds.cpp
+/fix_update_special_bonds.h
 /pair_bpm_spring.cpp
 /pair_bpm_spring.h

@ -633,6 +637,8 @@
 /compute_ptm_atom.h
 /compute_rattlers_atom.cpp
 /compute_rattlers_atom.h
+/compute_reaxff_atom.cpp
+/compute_reaxff_atom.h
 /compute_rigid_local.cpp
 /compute_rigid_local.h
 /compute_slcsa_atom.cpp
--- a/src/BPM/Install.sh
+++ b/src/BPM/Install.sh
@ -0,0 +1,49 @@
+# Install/unInstall package files in LAMMPS
+# mode = 0/1/2 for uninstall/install/update
+
+mode=$1
+
+# enforce using portable C locale
+LC_ALL=C
+export LC_ALL
+
+# arg1 = file, arg2 = file it depends on
+
+action () {
+  if (test $mode = 0) then
+    rm -f ../$1
+  elif (! cmp -s $1 ../$1) then
+    if (test -z "$2" || test -e ../$2) then
+      cp $1 ..
+      if (test $mode = 2) then
+        echo "  updating src/$1"
+      fi
+    fi
+  elif (test -n "$2") then
+    if (test ! -e ../$2) then
+      rm -f ../$1
+    fi
+  fi
+}
+
+# all package files with no dependencies
+
+for file in *.cpp *.h; do
+  test -f ${file} && action $file
+done
+
+# edit 2 Makefile.package files to include/exclude package info
+
+if (test $1 = 1) then
+
+  if (test -e ../Makefile.package) then
+    sed -i -e 's|^PKG_INC =[ \t]*|&-DLMP_BPM |' ../Makefile.package
+  fi
+
+elif (test $1 = 0) then
+
+  if (test -e ../Makefile.package) then
+    sed -i -e 's/[^ \t]*LMP_BPM[^ \t]* //' ../Makefile.package
+  fi
+
+fi
--- a/src/BPM/bond_bpm.cpp
+++ b/src/BPM/bond_bpm.cpp
@ -224,7 +224,7 @@ void BondBPM::settings(int narg, char **arg)

      ifix = modify->get_fix_by_id(id_fix_prop_atom);
      if (!ifix)
-        ifix = modify->add_fix(fmt::format("{} all property/atom {} {} {} ghost yes",
+        ifix = modify->add_fix(fmt::format("{} all property/atom d_{} d_{} d_{} ghost yes",
                                           id_fix_prop_atom, x_ref_id, y_ref_id, z_ref_id));

      int type_flag;
--- a/src/BPM/fix_bond_history.cpp
+++ b/src/BPM/fix_bond_history.cpp
--- a/src/BPM/fix_bond_history.h
+++ b/src/BPM/fix_bond_history.h
--- a/src/BPM/fix_update_special_bonds.cpp
+++ b/src/BPM/fix_update_special_bonds.cpp
@ -19,6 +19,7 @@
 #include "error.h"
 #include "force.h"
 #include "modify.h"
+#include "neighbor.h"
 #include "neigh_list.h"
 #include "pair.h"

@ -72,9 +73,6 @@ void FixUpdateSpecialBonds::setup(int /*vflag*/)
      force->special_coul[3] != 1.0)
    error->all(FLERR, "Fix update/special/bonds requires special Coulomb weights = 1,1,1");
  // Implies neighbor->special_flag = [X, 2, 1, 1]
-
-  if (utils::strmatch(force->pair_style, "^hybrid"))
-    error->all(FLERR, "Cannot use fix update/special/bonds with hybrid pair styles");
 }

 /* ----------------------------------------------------------------------
@ -155,44 +153,51 @@ void FixUpdateSpecialBonds::pre_exchange()

 void FixUpdateSpecialBonds::pre_force(int /*vflag*/)
 {
-  int i1, i2, j, jj, jnum;
+  int ilist, nlist, i1, i2, j, jj, jnum;
  int *jlist, *numneigh, **firstneigh;
  tagint tag1, tag2;
+  NeighList *list;

  int nlocal = atom->nlocal;
-
  tagint *tag = atom->tag;
-  NeighList *list = force->pair->list;    // may need to be generalized for pair hybrid*
-  numneigh = list->numneigh;
-  firstneigh = list->firstneigh;

  // In theory could communicate a list of broken bonds to neighboring processors here
  // to remove restriction that users use Newton bond off

-  for (auto const &it : new_broken_pairs) {
-    tag1 = it.first;
-    tag2 = it.second;
-    i1 = atom->map(tag1);
-    i2 = atom->map(tag2);
+  for (int ilist = 0; ilist < neighbor->nlist; ilist ++) {
+    list = neighbor->lists[ilist];

-    // Loop through atoms of owned atoms i j
-    if (i1 < nlocal) {
-      jlist = firstneigh[i1];
-      jnum = numneigh[i1];
-      for (jj = 0; jj < jnum; jj++) {
-        j = jlist[jj];
-        j &= SPECIALMASK;    // Clear special bond bits
-        if (tag[j] == tag2) jlist[jj] = j;
+    // Skip copied lists, will update original
+    if (list->copy) continue;
+
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+
+    for (auto const &it : new_broken_pairs) {
+      tag1 = it.first;
+      tag2 = it.second;
+      i1 = atom->map(tag1);
+      i2 = atom->map(tag2);
+
+      // Loop through atoms of owned atoms i j
+      if (i1 < nlocal) {
+        jlist = firstneigh[i1];
+        jnum = numneigh[i1];
+        for (jj = 0; jj < jnum; jj++) {
+          j = jlist[jj];
+          j &= SPECIALMASK;    // Clear special bond bits
+          if (tag[j] == tag2) jlist[jj] = j;
+        }
      }
-    }

-    if (i2 < nlocal) {
-      jlist = firstneigh[i2];
-      jnum = numneigh[i2];
-      for (jj = 0; jj < jnum; jj++) {
-        j = jlist[jj];
-        j &= SPECIALMASK;    // Clear special bond bits
-        if (tag[j] == tag1) jlist[jj] = j;
+      if (i2 < nlocal) {
+        jlist = firstneigh[i2];
+        jnum = numneigh[i2];
+        for (jj = 0; jj < jnum; jj++) {
+          j = jlist[jj];
+          j &= SPECIALMASK;    // Clear special bond bits
+          if (tag[j] == tag1) jlist[jj] = j;
+        }
      }
    }
  }
--- a/src/BPM/fix_update_special_bonds.h
+++ b/src/BPM/fix_update_special_bonds.h
--- a/src/BPM/pair_bpm_spring.cpp
+++ b/src/BPM/pair_bpm_spring.cpp
@ -19,6 +19,7 @@
 #include "force.h"
 #include "memory.h"
 #include "neigh_list.h"
+#include "neighbor.h"

 #include <cmath>

@ -202,6 +203,18 @@ void PairBPMSpring::coeff(int narg, char **arg)
  if (count == 0) error->all(FLERR, "Incorrect args for pair coefficients");
 }

+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairBPMSpring::init_style()
+{
+  if (comm->ghost_velocity == 0)
+    error->all(FLERR,"Pair bpm/spring requires ghost atoms store velocity");
+
+  neighbor->add_request(this);
+}
+
 /* ----------------------------------------------------------------------
   init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
--- a/src/BPM/pair_bpm_spring.h
+++ b/src/BPM/pair_bpm_spring.h
@ -31,6 +31,7 @@ class PairBPMSpring : public Pair {
  void compute(int, int) override;
  void settings(int, char **) override;
  void coeff(int, char **) override;
+  void init_style() override;
  double init_one(int, int) override;
  void write_restart(FILE *) override;
  void read_restart(FILE *) override;
--- a/src/DRUDE/fix_tgnh_drude.cpp
+++ b/src/DRUDE/fix_tgnh_drude.cpp
@ -1076,7 +1076,6 @@ void FixTGNHDrude::couple()

 void FixTGNHDrude::remap()
 {
-  int i;
  double oldlo,oldhi;
  double expfac;

--- a/src/Depend.sh
+++ b/src/Depend.sh
@ -99,6 +99,7 @@ fi

 if (test $1 = "EXTRA-PAIR") then
  depend GPU
+  depend KOKKOS
  depend OPENMP
 fi

--- a/src/EFF/fix_langevin_eff.cpp
+++ b/src/EFF/fix_langevin_eff.cpp
@ -137,7 +137,7 @@ void FixLangevinEff::post_force_no_tally()
  dof = domain->dimension * particles;
  fix_dof = 0;
  for (int i = 0; i < modify->nfix; i++)
-    fix_dof += modify->fix[i]->dof(igroup);
+    fix_dof += (int)modify->fix[i]->dof(igroup);

  // extra_dof = domain->dimension
  dof -= domain->dimension + fix_dof;
@ -306,7 +306,7 @@ void FixLangevinEff::post_force_tally()
  dof = domain->dimension * particles;
  fix_dof = 0;
  for (int i = 0; i < modify->nfix; i++)
-    fix_dof += modify->fix[i]->dof(igroup);
+    fix_dof += (int)modify->fix[i]->dof(igroup);

  // extra_dof = domain->dimension
  dof -= domain->dimension + fix_dof;
--- a/src/ELECTRODE/pppm_electrode.cpp
+++ b/src/ELECTRODE/pppm_electrode.cpp
@ -633,7 +633,9 @@ void PPPMElectrode::project_psi(double *vec, int sensor_grpbit)
  // project u_brick with weight matrix
  double **x = atom->x;
  int *mask = atom->mask;
-  double const scaleinv = 1.0 / (nx_pppm * ny_pppm * nz_pppm);
+  const bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
+  const double scaleinv = 1.0 / ngridtotal;
+
  for (int i = 0; i < atom->nlocal; i++) {
    if (!(mask[i] & sensor_grpbit)) continue;
    double v = 0.;
@ -1362,7 +1364,7 @@ double PPPMElectrode::compute_qopt()
  // each proc calculates contributions from every Pth grid point

  bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
-  int nxy_pppm = nx_pppm * ny_pppm;
+  bigint nxy_pppm = (bigint) nx_pppm * ny_pppm;

  double qopt = 0.0;

--- a/src/EXTRA-COMPUTE/compute_rattlers_atom.cpp
+++ b/src/EXTRA-COMPUTE/compute_rattlers_atom.cpp
@ -144,7 +144,6 @@ void ComputeRattlersAtom::compute_peratom()
  numneigh = list->numneigh;
  firstneigh = list->firstneigh;

-  Pair *pair = force->pair;
  double **cutsq = force->pair->cutsq;

  int change_flag = 1;
--- a/src/EXTRA-COMPUTE/compute_slcsa_atom.cpp
+++ b/src/EXTRA-COMPUTE/compute_slcsa_atom.cpp
@ -305,84 +305,84 @@ void ComputeSLCSAAtom::compute_peratom()

  int *mask = atom->mask;
  int nlocal = atom->nlocal;
-  double **compute_array;

  if (descriptorval.which == ArgInfo::COMPUTE) {
    if (!(descriptorval.val.c->invoked_flag & Compute::INVOKED_PERATOM)) {
      descriptorval.val.c->compute_peratom();
      descriptorval.val.c->invoked_flag |= Compute::INVOKED_PERATOM;
    }
-    compute_array = descriptorval.val.c->array_atom;
-  }
+    double **compute_array = descriptorval.val.c->array_atom;

-  memory->create(full_descriptor, ncomps, "slcsa/atom:local descriptor");
-  memory->create(projected_descriptor, nclasses - 1, "slcsa/atom:reduced descriptor");
-  memory->create(scores, nclasses, "slcsa/atom:scores");
-  memory->create(probas, nclasses, "slcsa/atom:probas");
-  memory->create(prodright, nclasses - 1, "slcsa/atom:prodright");
-  memory->create(dmaha, nclasses, "slcsa/atom:prodright");
+    memory->create(full_descriptor, ncomps, "slcsa/atom:local descriptor");
+    memory->create(projected_descriptor, nclasses - 1, "slcsa/atom:reduced descriptor");
+    memory->create(scores, nclasses, "slcsa/atom:scores");
+    memory->create(probas, nclasses, "slcsa/atom:probas");
+    memory->create(prodright, nclasses - 1, "slcsa/atom:prodright");
+    memory->create(dmaha, nclasses, "slcsa/atom:prodright");

-  for (int i = 0; i < nlocal; i++) {
-    if (mask[i] & groupbit) {
-      for (int j = 0; j < ncomps; j++) { full_descriptor[j] = compute_array[i][j]; }
-      // Here comes the LDA + LR process
-      // 1st step : Retrieve mean database descriptor
-      for (int j = 0; j < ncomps; j++) { full_descriptor[j] -= database_mean_descriptor[j]; }
-      // 2nd step : Matrix multiplication to go from ncompsx1 -> (nclasses-1)*1
-      for (int j = 0; j < nclasses - 1; j++) {
-        projected_descriptor[j] = 0.;
-        for (int k = 0; k < ncomps; k++) {
-          projected_descriptor[j] += full_descriptor[k] * lda_scalings[k][j];
-        }
-      }
-      // 3rd step : Matrix multiplication
-      for (int j = 0; j < nclasses; j++) {
-        scores[j] = lr_bias[j];
-        for (int k = 0; k < nclasses - 1; k++) {
-          scores[j] += lr_decision[j][k] * projected_descriptor[k];
-        }
-      }
-      // 4th step : Matrix multiplication
-      double sumexpscores = 0.;
-      for (int j = 0; j < nclasses; j++) sumexpscores += exp(scores[j]);
-      for (int j = 0; j < nclasses; j++) { probas[j] = exp(scores[j]) / sumexpscores; }
-
-      classification[i][nclasses] = argmax(probas, nclasses);
-
-      // 5th step : Mahalanobis distance
-      for (int j = 0; j < nclasses; j++) {
-        prodright[0] = 0.;
-        prodright[1] = 0.;
-        prodright[2] = 0.;
-        for (int k = 0; k < nclasses - 1; k++) {
-          for (int l = 0; l < nclasses - 1; l++) {
-            prodright[k] +=
-                (icov_list[j][k][l] * (projected_descriptor[k] - mean_projected_descriptors[j][k]));
+    for (int i = 0; i < nlocal; i++) {
+      if (mask[i] & groupbit) {
+        for (int j = 0; j < ncomps; j++) full_descriptor[j] = compute_array[i][j];
+        // Here comes the LDA + LR process
+        // 1st step : Retrieve mean database descriptor
+        for (int j = 0; j < ncomps; j++) full_descriptor[j] -= database_mean_descriptor[j];
+        // 2nd step : Matrix multiplication to go from ncompsx1 -> (nclasses-1)*1
+        for (int j = 0; j < nclasses - 1; j++) {
+          projected_descriptor[j] = 0.0;
+          for (int k = 0; k < ncomps; k++) {
+            projected_descriptor[j] += full_descriptor[k] * lda_scalings[k][j];
          }
        }
-        double prodleft = 0.;
-        for (int k = 0; k < nclasses - 1; k++) {
-          prodleft += (prodright[k] * (projected_descriptor[k] - mean_projected_descriptors[j][k]));
+        // 3rd step : Matrix multiplication
+        for (int j = 0; j < nclasses; j++) {
+          scores[j] = lr_bias[j];
+          for (int k = 0; k < nclasses - 1; k++) {
+            scores[j] += lr_decision[j][k] * projected_descriptor[k];
+          }
        }
-        classification[i][j] = sqrt(prodleft);
-      }
-      // 6th step : Sanity check
-      int locclass = classification[i][nclasses];
+        // 4th step : Matrix multiplication
+        double sumexpscores = 0.0;
+        for (int j = 0; j < nclasses; j++) sumexpscores += exp(scores[j]);
+        for (int j = 0; j < nclasses; j++) probas[j] = exp(scores[j]) / sumexpscores;

-      if (classification[i][locclass] > maha_thresholds[locclass]) {
-        classification[i][nclasses] = -1.;
-      }
+        classification[i][nclasses] = argmax(probas, nclasses);

-    } else {
-      for (int j = 0; j < ncols; j++) { classification[i][j] = -1.; }
+        // 5th step : Mahalanobis distance
+        for (int j = 0; j < nclasses; j++) {
+          prodright[0] = 0.0;
+          prodright[1] = 0.0;
+          prodright[2] = 0.0;
+          for (int k = 0; k < nclasses - 1; k++) {
+            for (int l = 0; l < nclasses - 1; l++) {
+              prodright[k] += (icov_list[j][k][l] *
+                               (projected_descriptor[k] - mean_projected_descriptors[j][k]));
+            }
+          }
+          double prodleft = 0.0;
+          for (int k = 0; k < nclasses - 1; k++) {
+            prodleft +=
+                (prodright[k] * (projected_descriptor[k] - mean_projected_descriptors[j][k]));
+          }
+          classification[i][j] = sqrt(prodleft);
+        }
+        // 6th step : Sanity check
+        int locclass = classification[i][nclasses];
+
+        if (classification[i][locclass] > maha_thresholds[locclass]) {
+          classification[i][nclasses] = -1.0;
+        }
+
+      } else {
+        for (int j = 0; j < ncols; j++) classification[i][j] = -1.0;
+      }
    }
+    memory->destroy(full_descriptor);
+    memory->destroy(projected_descriptor);
+    memory->destroy(scores);
+    memory->destroy(probas);
+    memory->destroy(prodright);
+    memory->destroy(dmaha);
  }
-  memory->destroy(full_descriptor);
-  memory->destroy(projected_descriptor);
-  memory->destroy(scores);
-  memory->destroy(probas);
-  memory->destroy(prodright);
-  memory->destroy(dmaha);
 }

 int ComputeSLCSAAtom::compute_ncomps(int twojmax)
--- a/src/EXTRA-COMPUTE/compute_stress_mop.cpp
+++ b/src/EXTRA-COMPUTE/compute_stress_mop.cpp
@ -23,6 +23,7 @@
 #include "atom_vec.h"
 #include "bond.h"
 #include "comm.h"
+#include "dihedral.h"
 #include "domain.h"
 #include "error.h"
 #include "force.h"
@ -38,8 +39,10 @@

 using namespace LAMMPS_NS;

+#define SMALL     0.001
+
 enum { X, Y, Z };
-enum { TOTAL, CONF, KIN, PAIR, BOND, ANGLE };
+enum { TOTAL, CONF, KIN, PAIR, BOND, ANGLE, DIHEDRAL };

 /* ---------------------------------------------------------------------- */

@ -49,6 +52,7 @@ ComputeStressMop::ComputeStressMop(LAMMPS *lmp, int narg, char **arg) : Compute(

  bondflag = 0;
  angleflag = 0;
+  dihedralflag = 0;

  // set compute mode and direction of plane(s) for pressure calculation

@ -129,6 +133,11 @@ ComputeStressMop::ComputeStressMop(LAMMPS *lmp, int narg, char **arg) : Compute(
        which[nvalues] = ANGLE;
        nvalues++;
      }
+    } else if (strcmp(arg[iarg],"dihedral") == 0) {
+      for (i=0; i<3; i++) {
+        which[nvalues] = DIHEDRAL;
+        nvalues++;
+      }
    } else
      error->all(FLERR, "Illegal compute stress/mop command");    //break;

@ -152,6 +161,8 @@ ComputeStressMop::ComputeStressMop(LAMMPS *lmp, int narg, char **arg) : Compute(
  bond_global = nullptr;
  angle_local = nullptr;
  angle_global = nullptr;
+  dihedral_local = nullptr;
+  dihedral_global = nullptr;

  // this fix produces a global vector

@ -162,6 +173,8 @@ ComputeStressMop::ComputeStressMop(LAMMPS *lmp, int narg, char **arg) : Compute(
  memory->create(bond_global, nvalues, "stress/mop:bond_global");
  memory->create(angle_local, nvalues, "stress/mop:angle_local");
  memory->create(angle_global, nvalues, "stress/mop:angle_global");
+  memory->create(dihedral_local,nvalues,"stress/mop:dihedral_local");
+  memory->create(dihedral_global,nvalues,"stress/mop:dihedral_global");
  size_vector = nvalues;

  vector_flag = 1;
@ -180,6 +193,8 @@ ComputeStressMop::~ComputeStressMop()
  memory->destroy(bond_global);
  memory->destroy(angle_local);
  memory->destroy(angle_global);
+  memory->destroy(dihedral_local);
+  memory->destroy(dihedral_global);
  memory->destroy(vector);
 }

@ -233,9 +248,13 @@ void ComputeStressMop::init()
      }
    }
    if (force->dihedral) {
-      if ((strcmp(force->dihedral_style, "zero") != 0) &&
-          (strcmp(force->dihedral_style, "none") != 0))
-        error->all(FLERR, "compute stress/mop does not account for dihedral potentials");
+      if (force->dihedral->born_matrix_enable == 0) {
+        if ((strcmp(force->dihedral_style, "zero") != 0) &&
+            (strcmp(force->dihedral_style, "none") != 0))
+          error->all(FLERR, "compute stress/mop does not account for dihedral potentials");
+      } else {
+        dihedralflag = 1;
+      }
    }
    if (force->improper) {
      if ((strcmp(force->improper_style, "zero") != 0) &&
@ -297,8 +316,18 @@ void ComputeStressMop::compute_vector()

  MPI_Allreduce(angle_local, angle_global, nvalues, MPI_DOUBLE, MPI_SUM, world);

+  if (dihedralflag) {
+    //Compute dihedral contribution on separate procs
+    compute_dihedrals();
+  } else {
+    for (int i=0; i<nvalues; i++) dihedral_local[i] = 0.0;
+  }
+
+  // sum dihedral contribution over all procs
+  MPI_Allreduce(dihedral_local,dihedral_global,nvalues,MPI_DOUBLE,MPI_SUM,world);
+
  for (int m = 0; m < nvalues; m++) {
-    vector[m] = values_global[m] + bond_global[m] + angle_global[m];
+    vector[m] = values_global[m] + bond_global[m] + angle_global[m] + dihedral_global[m];
  }
 }

@ -429,7 +458,12 @@ void ComputeStressMop::compute_pairs()
          xi[1] = atom->x[i][1];
          xi[2] = atom->x[i][2];

-          // velocities at t
+          // minimum image of xi with respect to the plane
+          xi[dir] -= pos;
+          domain->minimum_image(xi[0], xi[1], xi[2]);
+          xi[dir] += pos;
+
+          //velocities at t

          vi[0] = atom->v[i][0];
          vi[1] = atom->v[i][1];
@ -454,10 +488,8 @@ void ComputeStressMop::compute_pairs()
          // at each timestep, must check atoms going through the
          // image of the plane that is closest to the box

-          double pos_temp = pos + copysign(1.0, domain->prd_half[dir] - pos) * domain->prd[dir];
-          if (fabs(xi[dir] - pos) < fabs(xi[dir] - pos_temp)) pos_temp = pos;
-
-          if (((xi[dir] - pos_temp) * (xj[dir] - pos_temp)) < 0) {
+          double tau = (xi[dir] - pos) / (xi[dir] - xj[dir]);
+          if ((tau <= 1) && (tau >= 0)) {

            // sgn = copysign(1.0,vi[dir]-vcm[dir]);

@ -786,3 +818,308 @@ void ComputeStressMop::compute_angles()
    m += 3;
  }
 }
+
+/*------------------------------------------------------------------------
+  compute dihedral contribution to pressure of local proc
+  -------------------------------------------------------------------------*/
+
+void ComputeStressMop::compute_dihedrals()
+{
+  int i, nd, atom1, atom2, atom3, atom4, imol, iatom;
+  tagint tagprev;
+  double vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, vb3x, vb3y, vb3z;
+  double vb2xm, vb2ym, vb2zm;
+  double sb1, sb2, sb3, rb1, rb3, c0, b1mag2, b1mag, b2mag2;
+  double b2mag, b3mag2, b3mag, c2mag, ctmp, r12c1, c1mag, r12c2;
+  double s1, s2, s12, sc1, sc2, a11, a22, a33, a12, a13, a23;
+  double df[3], f1[3], f2[3], f3[3], f4[3];
+  double c, sx2, sy2, sz2, sin2;
+
+  double **x = atom->x;
+  tagint *tag = atom->tag;
+  int *num_dihedral = atom->num_dihedral;
+  tagint **dihedral_atom1 = atom->dihedral_atom1;
+  tagint **dihedral_atom2 = atom->dihedral_atom2;
+  tagint **dihedral_atom3 = atom->dihedral_atom3;
+  tagint **dihedral_atom4 = atom->dihedral_atom4;
+  int *mask = atom->mask;
+
+  int *molindex = atom->molindex;
+  int *molatom = atom->molatom;
+  Molecule **onemols = atom->avec->onemols;
+
+  int nlocal = atom->nlocal;
+  int molecular = atom->molecular;
+
+  // loop over all atoms and their dihedrals
+
+  Dihedral *dihedral = force->dihedral;
+
+  double dudih, du2dih;
+
+  double diffx[3] = {0.0, 0.0, 0.0};
+  double x_atom_1[3] = {0.0, 0.0, 0.0};
+  double x_atom_2[3] = {0.0, 0.0, 0.0};
+  double x_atom_3[3] = {0.0, 0.0, 0.0};
+  double x_atom_4[3] = {0.0, 0.0, 0.0};
+
+  // initialization
+  for (int i = 0; i < nvalues; i++) {
+    dihedral_local[i] = 0.0;
+  }
+  double local_contribution[3] = {0.0, 0.0, 0.0};
+
+  for (atom2 = 0; atom2 < nlocal; atom2++) {
+    if (!(mask[atom2] & groupbit)) continue;
+
+    if (molecular == Atom::MOLECULAR)
+      nd = num_dihedral[atom2];
+    else {
+      if (molindex[atom2] < 0) continue;
+      imol = molindex[atom2];
+      iatom = molatom[atom2];
+      nd = onemols[imol]->num_dihedral[iatom];
+    }
+
+    for (i = 0; i < nd; i++) {
+      if (molecular == 1) {
+        if (tag[atom2] != dihedral_atom2[atom2][i]) continue;
+          atom1 = atom->map(dihedral_atom1[atom2][i]);
+          atom3 = atom->map(dihedral_atom3[atom2][i]);
+          atom4 = atom->map(dihedral_atom4[atom2][i]);
+      } else {
+        if (tag[atom2] != onemols[imol]->dihedral_atom2[atom2][i]) continue;
+        tagprev = tag[atom2] - iatom - 1;
+        atom1 = atom->map(onemols[imol]->dihedral_atom1[atom2][i] + tagprev);
+        atom3 = atom->map(onemols[imol]->dihedral_atom3[atom2][i] + tagprev);
+        atom4 = atom->map(onemols[imol]->dihedral_atom4[atom2][i] + tagprev);
+      }
+
+      if (atom1 < 0 || !(mask[atom1] & groupbit)) continue;
+      if (atom3 < 0 || !(mask[atom3] & groupbit)) continue;
+      if (atom4 < 0 || !(mask[atom4] & groupbit)) continue;
+
+      // minimum image of atom1 with respect to the plane of interest
+      x_atom_1[0] = x[atom1][0];
+      x_atom_1[1] = x[atom1][1];
+      x_atom_1[2] = x[atom1][2];
+      x_atom_1[dir] -= pos;
+      domain->minimum_image(x_atom_1[0], x_atom_1[1], x_atom_1[2]);
+      x_atom_1[dir] += pos;
+
+      // minimum image of atom2 with respect to atom1
+      diffx[0] = x[atom2][0] - x_atom_1[0];
+      diffx[1] = x[atom2][1] - x_atom_1[1];
+      diffx[2] = x[atom2][2] - x_atom_1[2];
+      domain->minimum_image(diffx[0], diffx[1], diffx[2]);
+      x_atom_2[0] = x_atom_1[0] + diffx[0];
+      x_atom_2[1] = x_atom_1[1] + diffx[1];
+      x_atom_2[2] = x_atom_1[2] + diffx[2];
+
+      // minimum image of atom3 with respect to atom2
+      diffx[0] = x[atom3][0] - x_atom_2[0];
+      diffx[1] = x[atom3][1] - x_atom_2[1];
+      diffx[2] = x[atom3][2] - x_atom_2[2];
+      domain->minimum_image(diffx[0], diffx[1], diffx[2]);
+      x_atom_3[0] = x_atom_2[0] + diffx[0];
+      x_atom_3[1] = x_atom_2[1] + diffx[1];
+      x_atom_3[2] = x_atom_2[2] + diffx[2];
+
+      // minimum image of atom3 with respect to atom2
+      diffx[0] = x[atom4][0] - x_atom_3[0];
+      diffx[1] = x[atom4][1] - x_atom_3[1];
+      diffx[2] = x[atom4][2] - x_atom_3[2];
+      domain->minimum_image(diffx[0], diffx[1], diffx[2]);
+      x_atom_4[0] = x_atom_3[0] + diffx[0];
+      x_atom_4[1] = x_atom_3[1] + diffx[1];
+      x_atom_4[2] = x_atom_3[2] + diffx[2];
+
+      // check if any bond vector crosses the plane of interest
+      double tau_right = (x_atom_2[dir] - pos) / (x_atom_2[dir] - x_atom_1[dir]);
+      double tau_middle = (x_atom_3[dir] - pos) / (x_atom_3[dir] - x_atom_2[dir]);
+      double tau_left = (x_atom_4[dir] - pos) / (x_atom_4[dir] - x_atom_3[dir]);
+      bool right_cross = ((tau_right >=0) && (tau_right  <= 1));
+      bool middle_cross = ((tau_middle >= 0) && (tau_middle <= 1));
+      bool left_cross = ((tau_left >=0) && (tau_left <= 1));
+
+      // no bonds crossing the plane
+      if (!right_cross && !middle_cross && !left_cross) continue;
+
+      dihedral->born_matrix(i, atom1, atom2, atom3, atom4, dudih, du2dih);
+
+      // first bond
+      vb1x = x_atom_1[0] - x_atom_2[0];
+      vb1y = x_atom_1[1] - x_atom_2[1];
+      vb1z = x_atom_1[2] - x_atom_2[2];
+
+      // second bond
+      vb2x = x_atom_3[0] - x_atom_2[0];
+      vb2y = x_atom_3[1] - x_atom_2[1];
+      vb2z = x_atom_3[2] - x_atom_2[2];
+
+      vb2xm = -vb2x;
+      vb2ym = -vb2y;
+      vb2zm = -vb2z;
+
+      // third bond
+      vb3x = x_atom_4[0] - x_atom_3[0];
+      vb3y = x_atom_4[1] - x_atom_3[1];
+      vb3z = x_atom_4[2] - x_atom_3[2];
+
+      // c0 calculation
+      sb1 = 1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
+      sb2 = 1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z);
+      sb3 = 1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
+
+      rb1 = sqrt(sb1);
+      rb3 = sqrt(sb3);
+
+      c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
+      // 1st and 2nd angle
+      b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
+      b1mag = sqrt(b1mag2);
+      b2mag2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
+      b2mag = sqrt(b2mag2);
+      b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
+      b3mag = sqrt(b3mag2);
+
+      ctmp = vb1x*vb2x + vb1y*vb2y + vb1z*vb2z;
+      r12c1 = 1.0 / (b1mag*b2mag);
+      c1mag = ctmp * r12c1;
+
+      ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
+      r12c2 = 1.0 / (b2mag*b3mag);
+      c2mag = ctmp * r12c2;
+
+      // cos and sin of 2 angles and final c
+      sin2 = MAX(1.0 - c1mag*c1mag,0.0);
+      sc1 = sqrt(sin2);
+      if (sc1 < SMALL) sc1 = SMALL;
+      sc1 = 1.0/sc1;
+
+      sin2 = MAX(1.0 - c2mag*c2mag,0.0);
+      sc2 = sqrt(sin2);
+      if (sc2 < SMALL) sc2 = SMALL;
+      sc2 = 1.0/sc2;
+
+      s1 = sc1 * sc1;
+      s2 = sc2 * sc2;
+      s12 = sc1 * sc2;
+      c = (c0 + c1mag*c2mag) * s12;
+
+      // error check
+      if (c > 1.0) c = 1.0;
+      if (c < -1.0) c = -1.0;
+
+      // forces on each particle
+      double a = dudih;
+      c = c * a;
+      s12 = s12 * a;
+      a11 = c*sb1*s1;
+      a22 = -sb2 * (2.0*c0*s12 - c*(s1+s2));
+      a33 = c*sb3*s2;
+      a12 = -r12c1 * (c1mag*c*s1 + c2mag*s12);
+      a13 = -rb1*rb3*s12;
+      a23 = r12c2 * (c2mag*c*s2 + c1mag*s12);
+
+      sx2  = a12*vb1x + a22*vb2x + a23*vb3x;
+      sy2  = a12*vb1y + a22*vb2y + a23*vb3y;
+      sz2  = a12*vb1z + a22*vb2z + a23*vb3z;
+
+      f1[0] = a11*vb1x + a12*vb2x + a13*vb3x;
+      f1[1] = a11*vb1y + a12*vb2y + a13*vb3y;
+      f1[2] = a11*vb1z + a12*vb2z + a13*vb3z;
+
+      f2[0] = -sx2 - f1[0];
+      f2[1] = -sy2 - f1[1];
+      f2[2] = -sz2 - f1[2];
+
+      f4[0] = a13*vb1x + a23*vb2x + a33*vb3x;
+      f4[1] = a13*vb1y + a23*vb2y + a33*vb3y;
+      f4[2] = a13*vb1z + a23*vb2z + a33*vb3z;
+
+      f3[0] = sx2 - f4[0];
+      f3[1] = sy2 - f4[1];
+      f3[2] = sz2 - f4[2];
+
+      // only right bond crossing the plane
+      if (right_cross && !middle_cross && !left_cross)
+      {
+        double sgn = copysign(1.0, x_atom_1[dir] - pos);
+        df[0] = sgn * f1[0];
+        df[1] = sgn * f1[1];
+        df[2] = sgn * f1[2];
+      }
+
+      // only middle bond crossing the plane
+      if (!right_cross && middle_cross && !left_cross)
+      {
+        double sgn = copysign(1.0, x_atom_2[dir] - pos);
+        df[0] = sgn * (f2[0] + f1[0]);
+        df[1] = sgn * (f2[1] + f1[1]);
+        df[2] = sgn * (f2[2] + f1[2]);
+      }
+
+      // only left bond crossing the plane
+      if (!right_cross && !middle_cross && left_cross)
+      {
+        double sgn = copysign(1.0, x_atom_4[dir] - pos);
+        df[0] = sgn * f4[0];
+        df[1] = sgn * f4[1];
+        df[2] = sgn * f4[2];
+      }
+
+      // only right & middle bonds crossing the plane
+      if (right_cross && middle_cross && !left_cross)
+      {
+        double sgn = copysign(1.0, x_atom_2[dir] - pos);
+        df[0] = sgn * f2[0];
+        df[1] = sgn * f2[1];
+        df[2] = sgn * f2[2];
+      }
+
+      // only right & left bonds crossing the plane
+      if (right_cross && !middle_cross && left_cross)
+      {
+        double sgn = copysign(1.0, x_atom_1[dir] - pos);
+        df[0] = sgn * (f1[0] + f4[0]);
+        df[1] = sgn * (f1[1] + f4[1]);
+        df[2] = sgn * (f1[2] + f4[2]);
+      }
+
+      // only middle & left bonds crossing the plane
+      if (!right_cross && middle_cross && left_cross)
+      {
+        double sgn = copysign(1.0, x_atom_3[dir] - pos);
+        df[0] = sgn * f3[0];
+        df[1] = sgn * f3[1];
+        df[2] = sgn * f3[2];
+      }
+
+      // all three bonds crossing the plane
+      if (right_cross && middle_cross && left_cross)
+      {
+        double sgn = copysign(1.0, x_atom_1[dir] - pos);
+        df[0] = sgn * (f1[0] + f3[0]);
+        df[1] = sgn * (f1[1] + f3[1]);
+        df[2] = sgn * (f1[2] + f3[2]);
+      }
+
+      local_contribution[0] += df[0]/area*nktv2p;
+      local_contribution[1] += df[1]/area*nktv2p;
+      local_contribution[2] += df[2]/area*nktv2p;
+    }
+  }
+
+  // loop over the keywords and if necessary add the dihedral contribution
+  int m = 0;
+  while (m < nvalues) {
+    if ((which[m] == CONF) || (which[m] == TOTAL) || (which[m] == DIHEDRAL)) {
+        dihedral_local[m] = local_contribution[0];
+        dihedral_local[m+1] = local_contribution[1];
+        dihedral_local[m+2] = local_contribution[2];
+    }
+    m += 3;
+  }
+
+}
--- a/src/EXTRA-COMPUTE/compute_stress_mop.h
+++ b/src/EXTRA-COMPUTE/compute_stress_mop.h
@ -40,15 +40,17 @@ class ComputeStressMop : public Compute {
  void compute_pairs();
  void compute_bonds();
  void compute_angles();
+  void compute_dihedrals();

  int nvalues, dir;
  int *which;

-  int bondflag, angleflag;
+  int bondflag, angleflag, dihedralflag;

  double *values_local, *values_global;
  double *bond_local, *bond_global;
  double *angle_local, *angle_global;
+  double *dihedral_local, *dihedral_global;
  double pos, pos1, dt, nktv2p, ftm2v;
  double area;
  class NeighList *list;
--- a/src/EXTRA-COMPUTE/compute_stress_mop_profile.cpp
+++ b/src/EXTRA-COMPUTE/compute_stress_mop_profile.cpp
@ -13,15 +13,17 @@

 /*------------------------------------------------------------------------
  Contributing Authors : Romain Vermorel (LFCR), Laurent Joly (ULyon)
-  Support for bonds added by : Evangelos Voyiatzis (NovaMechanics)
+  Support for bonds, angles and dihedrals added by : Evangelos Voyiatzis (NovaMechanics)
  --------------------------------------------------------------------------*/

 #include "compute_stress_mop_profile.h"

+#include "angle.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "bond.h"
 #include "comm.h"
+#include "dihedral.h"
 #include "domain.h"
 #include "error.h"
 #include "force.h"
@ -37,9 +39,10 @@

 using namespace LAMMPS_NS;

+#define SMALL     0.001
+
 enum { X, Y, Z };
-enum { LOWER, CENTER, UPPER, COORD };
-enum { TOTAL, CONF, KIN, PAIR, BOND };
+enum { TOTAL, CONF, KIN, PAIR, BOND, ANGLE, DIHEDRAL };

 /* ---------------------------------------------------------------------- */

@ -49,6 +52,8 @@ ComputeStressMopProfile::ComputeStressMopProfile(LAMMPS *lmp, int narg, char **a
  if (narg < 7) utils::missing_cmd_args(FLERR, "compute stress/mop/profile", error);

  bondflag = 0;
+  angleflag = 0;
+  dihedralflag = 0;

  // set compute mode and direction of plane(s) for pressure calculation

@ -63,15 +68,15 @@ ComputeStressMopProfile::ComputeStressMopProfile(LAMMPS *lmp, int narg, char **a

  // bin parameters

-  if (strcmp(arg[4], "lower") == 0)
-    originflag = LOWER;
-  else if (strcmp(arg[4], "center") == 0)
-    originflag = CENTER;
-  else if (strcmp(arg[4], "upper") == 0)
-    originflag = UPPER;
-  else
-    originflag = COORD;
-  if (originflag == COORD) origin = utils::numeric(FLERR, arg[4], false, lmp);
+  if (strcmp(arg[4], "lower") == 0) {
+    origin = domain->boxlo[dir];
+  } else if (strcmp(arg[4], "center") == 0) {
+    origin = 0.5 * (domain->boxlo[dir] + domain->boxhi[dir]);
+  } else if (strcmp(arg[4], "upper") == 0) {
+    origin = domain->boxhi[dir];
+  } else {
+    origin = utils::numeric(FLERR, arg[4], false, lmp);
+  }
  delta = utils::numeric(FLERR, arg[5], false, lmp);
  invdelta = 1.0 / delta;

@ -108,6 +113,16 @@ ComputeStressMopProfile::ComputeStressMopProfile(LAMMPS *lmp, int narg, char **a
        which[nvalues] = BOND;
        nvalues++;
      }
+    } else if (strcmp(arg[iarg], "angle") == 0) {
+      for (i = 0; i < 3; i++) {
+        which[nvalues] = ANGLE;
+        nvalues++;
+      }
+    } else if (strcmp(arg[iarg],"dihedral") == 0) {
+      for (i=0; i<3; i++) {
+        which[nvalues] = DIHEDRAL;
+        nvalues++;
+      }
    } else
      error->all(FLERR, "Illegal compute stress/mop/profile command");    //break;

@ -133,6 +148,10 @@ ComputeStressMopProfile::ComputeStressMopProfile(LAMMPS *lmp, int narg, char **a
  values_local = values_global = array = nullptr;
  bond_local = nullptr;
  bond_global = nullptr;
+  angle_local = nullptr;
+  angle_global = nullptr;
+  dihedral_local = nullptr;
+  dihedral_global = nullptr;
  local_contribution = nullptr;

  // bin setup
@ -161,6 +180,10 @@ ComputeStressMopProfile::~ComputeStressMopProfile()
  memory->destroy(values_global);
  memory->destroy(bond_local);
  memory->destroy(bond_global);
+  memory->destroy(angle_local);
+  memory->destroy(angle_global);
+  memory->destroy(dihedral_local);
+  memory->destroy(dihedral_global);
  memory->destroy(local_contribution);
  memory->destroy(array);
 }
@ -208,13 +231,25 @@ void ComputeStressMopProfile::init()

    if (force->bond) bondflag = 1;

-    if (force->angle)
-      if ((strcmp(force->angle_style, "zero") != 0) && (strcmp(force->angle_style, "none") != 0))
-        error->all(FLERR, "compute stress/mop/profile does not account for angle potentials");
-    if (force->dihedral)
-      if ((strcmp(force->dihedral_style, "zero") != 0) &&
-          (strcmp(force->dihedral_style, "none") != 0))
-        error->all(FLERR, "compute stress/mop/profile does not account for dihedral potentials");
+    if (force->angle) {
+      if (force->angle->born_matrix_enable == 0) {
+        if ((strcmp(force->angle_style, "zero") != 0) && (strcmp(force->angle_style, "none") != 0))
+          error->all(FLERR,"compute stress/mop/profile does not account for angle potentials");
+      } else {
+        angleflag = 1;
+      }
+    }
+
+    if (force->dihedral) {
+      if (force->dihedral->born_matrix_enable == 0) {
+        if ((strcmp(force->dihedral_style, "zero") != 0) &&
+            (strcmp(force->dihedral_style, "none") != 0))
+          error->all(FLERR, "compute stress/mop/profile does not account for dihedral potentials");
+      } else {
+        dihedralflag = 1;
+      }
+    }
+
    if (force->improper)
      if ((strcmp(force->improper_style, "zero") != 0) &&
          (strcmp(force->improper_style, "none") != 0))
@ -263,16 +298,43 @@ void ComputeStressMopProfile::compute_array()
  }

  // sum bond contribution over all procs
-
  MPI_Allreduce(&bond_local[0][0], &bond_global[0][0], nbins * nvalues, MPI_DOUBLE, MPI_SUM, world);

+  if (angleflag) {
+    //Compute angle contribution on separate procs
+    compute_angles();
+  } else {
+    for (int m = 0; m < nbins; m++) {
+      for (int i = 0; i < nvalues; i++) {
+        angle_local[m][i] = 0.0;
+      }
+    }
+  }
+
+  // sum angle contribution over all procs
+  MPI_Allreduce(&angle_local[0][0],&angle_global[0][0],nbins*nvalues,MPI_DOUBLE,MPI_SUM,world);
+
+  if (dihedralflag) {
+    //Compute dihedral contribution on separate procs
+    compute_dihedrals();
+  } else {
+    for (int m = 0; m < nbins; m++) {
+      for (int i = 0; i < nvalues; i++) {
+        dihedral_local[m][i] = 0.0;
+      }
+    }
+  }
+
+  // sum dihedral contribution over all procs
+  MPI_Allreduce(&dihedral_local[0][0],&dihedral_global[0][0],nbins*nvalues,MPI_DOUBLE,MPI_SUM,world);
+
  for (int ibin = 0; ibin < nbins; ibin++) {
-    array[ibin][0] = coord[ibin][0];
+    array[ibin][0] = coord[ibin];

    int mo = 1;
    int m = 0;
    while (m < nvalues) {
-      array[ibin][m + mo] = values_global[ibin][m] + bond_global[ibin][m];
+      array[ibin][m + mo] = values_global[ibin][m] + bond_global[ibin][m] + angle_global[ibin][m] + dihedral_global[ibin][m];
      m++;
    }
  }
@ -366,8 +428,8 @@ void ComputeStressMopProfile::compute_pairs()
          if (newton_pair || j < nlocal) {

            for (ibin = 0; ibin < nbins; ibin++) {
-              pos = coord[ibin][0];
-              pos1 = coordp[ibin][0];
+              pos = coord[ibin];
+              pos1 = coordp[ibin];

              // check if ij pair is across plane, add contribution to pressure

@ -392,8 +454,8 @@ void ComputeStressMopProfile::compute_pairs()
          } else {

            for (ibin = 0; ibin < nbins; ibin++) {
-              pos = coord[ibin][0];
-              pos1 = coordp[ibin][0];
+              pos = coord[ibin];
+              pos1 = coordp[ibin];

              //check if ij pair is across plane, add contribution to pressure

@ -454,15 +516,29 @@ void ComputeStressMopProfile::compute_pairs()
          xj[2] = xi[2] - vi[2] * dt + fi[2] * iterm * dt;

          for (ibin = 0; ibin < nbins; ibin++) {
-            pos = coord[ibin][0];
-            pos1 = coordp[ibin][0];
+            pos = coord[ibin];
+            pos1 = coordp[ibin];

-            if (((xi[dir] - pos) * (xj[dir] - pos) * (xi[dir] - pos1) * (xj[dir] - pos1) < 0)) {
+            // minimum image of xi with respect to the plane
+            xi[dir] -= pos;
+            domain->minimum_image(xi[0], xi[1], xi[2]);
+            xi[dir] += pos;
+
+            // minimum image of xj with respect to xi
+            xj[0] -= xi[0];
+            xj[1] -= xi[1];
+            xj[2] -= xi[2];
+            domain->minimum_image(xi[0], xi[1], xi[2]);
+            xj[0] += xi[0];
+            xj[1] += xi[1];
+            xj[2] += xi[2];
+
+            double tau = (xi[dir] - pos) / (xi[dir] - xj[dir]);
+            if ((tau <= 1) && (tau >= 0)) {

              sgn = copysign(1.0, vi[dir]);

-              // approximate crossing velocity by v(t-dt/2) (based on Velocity-Verlet alg.)
-
+              //approximate crossing velocity by v(t-dt/2) (based on Velocity-Verlet alg.)
              double vcross[3];
              vcross[0] = vi[0] - fi[0] * iterm;
              vcross[1] = vi[1] - fi[1] * iterm;
@ -549,7 +625,7 @@ void ComputeStressMopProfile::compute_bonds()
      if (btype <= 0) continue;

      for (int ibin = 0; ibin < nbins; ibin++) {
-        double pos = coord[ibin][0];
+        double pos = coord[ibin];

        // minimum image of atom1 with respect to the plane of interest

@ -607,6 +683,506 @@ void ComputeStressMopProfile::compute_bonds()
  }
 }

+/*------------------------------------------------------------------------
+  compute angle contribution to pressure of local proc
+  -------------------------------------------------------------------------*/
+
+void ComputeStressMopProfile::compute_angles()
+{
+  int na, atom1, atom2, atom3, imol, iatom, atype;
+  tagint tagprev;
+  double r1, r2, cos_theta;
+
+  double **x = atom->x;
+  tagint *tag = atom->tag;
+  int *num_angle = atom->num_angle;
+  tagint **angle_atom1 = atom->angle_atom1;
+  tagint **angle_atom2 = atom->angle_atom2;
+  tagint **angle_atom3 = atom->angle_atom3;
+  int **angle_type = atom->angle_type;
+  int *mask = atom->mask;
+
+  int *molindex = atom->molindex;
+  int *molatom = atom->molatom;
+  Molecule **onemols = atom->avec->onemols;
+
+  int nlocal = atom->nlocal;
+  int molecular = atom->molecular;
+
+  // loop over all atoms and their angles
+  Angle *angle = force->angle;
+
+  double duang, du2ang;
+  double dx[3] = {0.0, 0.0, 0.0};
+  double dx_left[3] = {0.0, 0.0, 0.0};
+  double dx_right[3] = {0.0, 0.0, 0.0};
+  double x_angle_left[3] = {0.0, 0.0, 0.0};
+  double x_angle_middle[3] = {0.0, 0.0, 0.0};
+  double x_angle_right[3] = {0.0, 0.0, 0.0};
+  double dcos_theta[3] = {0.0, 0.0, 0.0};
+
+  // initialization
+  for (int m = 0; m < nbins; m++) {
+    for (int i = 0; i < nvalues; i++) {
+      angle_local[m][i] = 0.0;
+    }
+    local_contribution[m][0] = 0.0;
+    local_contribution[m][1] = 0.0;
+    local_contribution[m][2] = 0.0;
+  }
+
+
+  for (atom2 = 0; atom2 < nlocal; atom2++) {
+    if (!(mask[atom2] & groupbit)) continue;
+
+    if (molecular == 1)
+      na = num_angle[atom2];
+    else {
+      if (molindex[atom2] < 0) continue;
+      imol = molindex[atom2];
+      iatom = molatom[atom2];
+      na = onemols[imol]->num_angle[iatom];
+    }
+
+    for (int i = 0; i < na; i++) {
+      if (molecular == 1) {
+        if (tag[atom2] != angle_atom2[atom2][i]) continue;
+        atype = angle_type[atom2][i];
+        atom1 = atom->map(angle_atom1[atom2][i]);
+        atom3 = atom->map(angle_atom3[atom2][i]);
+      } else {
+        if (tag[atom2] != onemols[imol]->angle_atom2[atom2][i]) continue;
+        atype = onemols[imol]->angle_type[atom2][i];
+        tagprev = tag[atom2] - iatom - 1;
+        atom1 = atom->map(onemols[imol]->angle_atom1[atom2][i] + tagprev);
+        atom3 = atom->map(onemols[imol]->angle_atom3[atom2][i] + tagprev);
+      }
+
+      if (atom1 < 0 || !(mask[atom1] & groupbit)) continue;
+      if (atom3 < 0 || !(mask[atom3] & groupbit)) continue;
+      if (atype <= 0) continue;
+
+      for (int ibin = 0; ibin<nbins; ibin++) {
+        double pos = coord[ibin];
+
+        // minimum image of atom1 with respect to the plane of interest
+        dx[0] = x[atom1][0];
+        dx[1] = x[atom1][1];
+        dx[2] = x[atom1][2];
+        dx[dir] -= pos;
+        domain->minimum_image(dx[0], dx[1], dx[2]);
+        x_angle_left[0] = dx[0];
+        x_angle_left[1] = dx[1];
+        x_angle_left[2] = dx[2];
+        x_angle_left[dir] += pos;
+
+        // minimum image of atom2 with respect to atom1
+        dx_left[0] = x[atom2][0] - x_angle_left[0];
+        dx_left[1] = x[atom2][1] - x_angle_left[1];
+        dx_left[2] = x[atom2][2] - x_angle_left[2];
+        domain->minimum_image(dx_left[0], dx_left[1], dx_left[2]);
+        x_angle_middle[0] = x_angle_left[0] + dx_left[0];
+        x_angle_middle[1] = x_angle_left[1] + dx_left[1];
+        x_angle_middle[2] = x_angle_left[2] + dx_left[2];
+
+        // minimum image of atom3 with respect to atom2
+        dx_right[0] = x[atom3][0] - x_angle_middle[0];
+        dx_right[1] = x[atom3][1] - x_angle_middle[1];
+        dx_right[2] = x[atom3][2] - x_angle_middle[2];
+        domain->minimum_image(dx_right[0], dx_right[1], dx_right[2]);
+        x_angle_right[0] = x_angle_middle[0] + dx_right[0];
+        x_angle_right[1] = x_angle_middle[1] + dx_right[1];
+        x_angle_right[2] = x_angle_middle[2] + dx_right[2];
+
+        // check if any bond vector crosses the plane of interest
+        double tau_right = (x_angle_right[dir] - pos) / (x_angle_right[dir] - x_angle_middle[dir]);
+        double tau_left = (x_angle_middle[dir] - pos) / (x_angle_middle[dir] - x_angle_left[dir]);
+        bool right_cross = ((tau_right >=0) && (tau_right  <= 1));
+        bool left_cross = ((tau_left >=0) && (tau_left <= 1));
+
+        // no bonds crossing the plane
+        if (!right_cross && !left_cross) continue;
+
+        // compute the cos(theta) of the angle
+        r1 = sqrt(dx_left[0]*dx_left[0] + dx_left[1]*dx_left[1] + dx_left[2]*dx_left[2]);
+        r2 = sqrt(dx_right[0]*dx_right[0] + dx_right[1]*dx_right[1] + dx_right[2]*dx_right[2]);
+        cos_theta = -(dx_right[0]*dx_left[0] + dx_right[1]*dx_left[1] + dx_right[2]*dx_left[2])/(r1*r2);
+
+        if (cos_theta >  1.0) cos_theta = 1.0;
+        if (cos_theta < -1.0) cos_theta = -1.0;
+
+        // The method returns derivative with regards to cos(theta)
+        angle->born_matrix(atype, atom1, atom2, atom3, duang, du2ang);
+        // only right bond crossing the plane
+        if (right_cross && !left_cross)
+        {
+          double sgn = copysign(1.0, x_angle_right[dir] - pos);
+          dcos_theta[0] = sgn*(dx_right[0]*cos_theta/r2 + dx_left[0]/r1)/r2;
+          dcos_theta[1] = sgn*(dx_right[1]*cos_theta/r2 + dx_left[1]/r1)/r2;
+          dcos_theta[2] = sgn*(dx_right[2]*cos_theta/r2 + dx_left[2]/r1)/r2;
+        }
+
+        // only left bond crossing the plane
+        if (!right_cross && left_cross)
+        {
+          double sgn = copysign(1.0, x_angle_left[dir] - pos);
+          dcos_theta[0] = -sgn*(dx_left[0]*cos_theta/r1 + dx_right[0]/r2)/r1;
+          dcos_theta[1] = -sgn*(dx_left[1]*cos_theta/r1 + dx_right[1]/r2)/r1;
+          dcos_theta[2] = -sgn*(dx_left[2]*cos_theta/r1 + dx_right[2]/r2)/r1;
+        }
+
+        // both bonds crossing the plane
+        if (right_cross && left_cross)
+        {
+          // due to right bond
+          double sgn = copysign(1.0, x_angle_middle[dir] - pos);
+          dcos_theta[0] = -sgn*(dx_right[0]*cos_theta/r2 + dx_left[0]/r1)/r2;
+          dcos_theta[1] = -sgn*(dx_right[1]*cos_theta/r2 + dx_left[1]/r1)/r2;
+          dcos_theta[2] = -sgn*(dx_right[2]*cos_theta/r2 + dx_left[2]/r1)/r2;
+
+          // due to left bond
+          dcos_theta[0] += sgn*(dx_left[0]*cos_theta/r1 + dx_right[0]/r2)/r1;
+          dcos_theta[1] += sgn*(dx_left[1]*cos_theta/r1 + dx_right[1]/r2)/r1;
+          dcos_theta[2] += sgn*(dx_left[2]*cos_theta/r1 + dx_right[2]/r2)/r1;
+        }
+
+        // final contribution of the given angle term
+        local_contribution[ibin][0] += duang*dcos_theta[0]/area*nktv2p;
+        local_contribution[ibin][1] += duang*dcos_theta[1]/area*nktv2p;
+        local_contribution[ibin][2] += duang*dcos_theta[2]/area*nktv2p;
+      }
+    }
+  }
+
+  // loop over the keywords and if necessary add the angle contribution
+  int m = 0;
+  while (m < nvalues) {
+    if (which[m] == CONF || which[m] == TOTAL || which[m] == ANGLE) {
+      for (int ibin = 0; ibin < nbins; ibin++) {
+        angle_local[ibin][m] = local_contribution[ibin][0];
+        angle_local[ibin][m+1] = local_contribution[ibin][1];
+        angle_local[ibin][m+2] = local_contribution[ibin][2];
+      }
+    }
+    m += 3;
+  }
+}
+
+/*------------------------------------------------------------------------
+  compute dihedral contribution to pressure of local proc
+  -------------------------------------------------------------------------*/
+
+void ComputeStressMopProfile::compute_dihedrals()
+{
+  int i, nd, atom1, atom2, atom3, atom4, imol, iatom;
+  tagint tagprev;
+  double vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, vb3x, vb3y, vb3z;
+  double vb2xm, vb2ym, vb2zm;
+  double sb1, sb2, sb3, rb1, rb3, c0, b1mag2, b1mag, b2mag2;
+  double b2mag, b3mag2, b3mag, c2mag, ctmp, r12c1, c1mag, r12c2;
+  double s1, s2, s12, sc1, sc2, a11, a22, a33, a12, a13, a23;
+  double df[3], f1[3], f2[3], f3[3], f4[3];
+  double c, sx2, sy2, sz2, sin2;
+
+  double **x = atom->x;
+  tagint *tag = atom->tag;
+  int *num_dihedral = atom->num_dihedral;
+  tagint **dihedral_atom1 = atom->dihedral_atom1;
+  tagint **dihedral_atom2 = atom->dihedral_atom2;
+  tagint **dihedral_atom3 = atom->dihedral_atom3;
+  tagint **dihedral_atom4 = atom->dihedral_atom4;
+  int *mask = atom->mask;
+
+  int *molindex = atom->molindex;
+  int *molatom = atom->molatom;
+  Molecule **onemols = atom->avec->onemols;
+
+  int nlocal = atom->nlocal;
+  int molecular = atom->molecular;
+
+  // loop over all atoms and their dihedrals
+
+  Dihedral *dihedral = force->dihedral;
+
+  double dudih, du2dih;
+
+  double diffx[3] = {0.0, 0.0, 0.0};
+  double x_atom_1[3] = {0.0, 0.0, 0.0};
+  double x_atom_2[3] = {0.0, 0.0, 0.0};
+  double x_atom_3[3] = {0.0, 0.0, 0.0};
+  double x_atom_4[3] = {0.0, 0.0, 0.0};
+
+  // initialization
+  for (int m = 0; m < nbins; m++) {
+    for (int i = 0; i < nvalues; i++) {
+      dihedral_local[m][i] = 0.0;
+    }
+    local_contribution[m][0] = 0.0;
+    local_contribution[m][1] = 0.0;
+    local_contribution[m][2] = 0.0;
+  }
+
+  for (atom2 = 0; atom2 < nlocal; atom2++) {
+    if (!(mask[atom2] & groupbit)) continue;
+
+    if (molecular == Atom::MOLECULAR)
+      nd = num_dihedral[atom2];
+    else {
+      if (molindex[atom2] < 0) continue;
+      imol = molindex[atom2];
+      iatom = molatom[atom2];
+      nd = onemols[imol]->num_dihedral[iatom];
+    }
+
+    for (i = 0; i < nd; i++) {
+      if (molecular == 1) {
+        if (tag[atom2] != dihedral_atom2[atom2][i]) continue;
+          atom1 = atom->map(dihedral_atom1[atom2][i]);
+          atom3 = atom->map(dihedral_atom3[atom2][i]);
+          atom4 = atom->map(dihedral_atom4[atom2][i]);
+      } else {
+        if (tag[atom2] != onemols[imol]->dihedral_atom2[atom2][i]) continue;
+        tagprev = tag[atom2] - iatom - 1;
+        atom1 = atom->map(onemols[imol]->dihedral_atom1[atom2][i] + tagprev);
+        atom3 = atom->map(onemols[imol]->dihedral_atom3[atom2][i] + tagprev);
+        atom4 = atom->map(onemols[imol]->dihedral_atom4[atom2][i] + tagprev);
+      }
+
+      if (atom1 < 0 || !(mask[atom1] & groupbit)) continue;
+      if (atom3 < 0 || !(mask[atom3] & groupbit)) continue;
+      if (atom4 < 0 || !(mask[atom4] & groupbit)) continue;
+
+      for (int ibin = 0; ibin<nbins; ibin++) {
+        double pos = coord[ibin];
+
+        // minimum image of atom1 with respect to the plane of interest
+        x_atom_1[0] = x[atom1][0];
+        x_atom_1[1] = x[atom1][1];
+        x_atom_1[2] = x[atom1][2];
+        x_atom_1[dir] -= pos;
+        domain->minimum_image(x_atom_1[0], x_atom_1[1], x_atom_1[2]);
+        x_atom_1[dir] += pos;
+
+        // minimum image of atom2 with respect to atom1
+        diffx[0] = x[atom2][0] - x_atom_1[0];
+        diffx[1] = x[atom2][1] - x_atom_1[1];
+        diffx[2] = x[atom2][2] - x_atom_1[2];
+        domain->minimum_image(diffx[0], diffx[1], diffx[2]);
+        x_atom_2[0] = x_atom_1[0] + diffx[0];
+        x_atom_2[1] = x_atom_1[1] + diffx[1];
+        x_atom_2[2] = x_atom_1[2] + diffx[2];
+
+        // minimum image of atom3 with respect to atom2
+        diffx[0] = x[atom3][0] - x_atom_2[0];
+        diffx[1] = x[atom3][1] - x_atom_2[1];
+        diffx[2] = x[atom3][2] - x_atom_2[2];
+        domain->minimum_image(diffx[0], diffx[1], diffx[2]);
+        x_atom_3[0] = x_atom_2[0] + diffx[0];
+        x_atom_3[1] = x_atom_2[1] + diffx[1];
+        x_atom_3[2] = x_atom_2[2] + diffx[2];
+
+        // minimum image of atom3 with respect to atom2
+        diffx[0] = x[atom4][0] - x_atom_3[0];
+        diffx[1] = x[atom4][1] - x_atom_3[1];
+        diffx[2] = x[atom4][2] - x_atom_3[2];
+        domain->minimum_image(diffx[0], diffx[1], diffx[2]);
+        x_atom_4[0] = x_atom_3[0] + diffx[0];
+        x_atom_4[1] = x_atom_3[1] + diffx[1];
+        x_atom_4[2] = x_atom_3[2] + diffx[2];
+
+        // check if any bond vector crosses the plane of interest
+        double tau_right = (x_atom_2[dir] - pos) / (x_atom_2[dir] - x_atom_1[dir]);
+        double tau_middle = (x_atom_3[dir] - pos) / (x_atom_3[dir] - x_atom_2[dir]);
+        double tau_left = (x_atom_4[dir] - pos) / (x_atom_4[dir] - x_atom_3[dir]);
+        bool right_cross = ((tau_right >=0) && (tau_right  <= 1));
+        bool middle_cross = ((tau_middle >= 0) && (tau_middle <= 1));
+        bool left_cross = ((tau_left >=0) && (tau_left <= 1));
+
+        // no bonds crossing the plane
+        if (!right_cross && !middle_cross && !left_cross) continue;
+
+        dihedral->born_matrix(i, atom1, atom2, atom3, atom4, dudih, du2dih);
+
+        // first bond
+        vb1x = x_atom_1[0] - x_atom_2[0];
+        vb1y = x_atom_1[1] - x_atom_2[1];
+        vb1z = x_atom_1[2] - x_atom_2[2];
+
+        // second bond
+        vb2x = x_atom_3[0] - x_atom_2[0];
+        vb2y = x_atom_3[1] - x_atom_2[1];
+        vb2z = x_atom_3[2] - x_atom_2[2];
+
+        vb2xm = -vb2x;
+        vb2ym = -vb2y;
+        vb2zm = -vb2z;
+
+        // third bond
+        vb3x = x_atom_4[0] - x_atom_3[0];
+        vb3y = x_atom_4[1] - x_atom_3[1];
+        vb3z = x_atom_4[2] - x_atom_3[2];
+
+        // c0 calculation
+        sb1 = 1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
+        sb2 = 1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z);
+        sb3 = 1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
+
+        rb1 = sqrt(sb1);
+        rb3 = sqrt(sb3);
+
+        c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
+        // 1st and 2nd angle
+        b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
+        b1mag = sqrt(b1mag2);
+        b2mag2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
+        b2mag = sqrt(b2mag2);
+        b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
+        b3mag = sqrt(b3mag2);
+
+        ctmp = vb1x*vb2x + vb1y*vb2y + vb1z*vb2z;
+        r12c1 = 1.0 / (b1mag*b2mag);
+        c1mag = ctmp * r12c1;
+
+        ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
+        r12c2 = 1.0 / (b2mag*b3mag);
+        c2mag = ctmp * r12c2;
+
+        // cos and sin of 2 angles and final c
+        sin2 = MAX(1.0 - c1mag*c1mag,0.0);
+        sc1 = sqrt(sin2);
+        if (sc1 < SMALL) sc1 = SMALL;
+        sc1 = 1.0/sc1;
+
+        sin2 = MAX(1.0 - c2mag*c2mag,0.0);
+        sc2 = sqrt(sin2);
+        if (sc2 < SMALL) sc2 = SMALL;
+        sc2 = 1.0/sc2;
+
+        s1 = sc1 * sc1;
+        s2 = sc2 * sc2;
+        s12 = sc1 * sc2;
+        c = (c0 + c1mag*c2mag) * s12;
+
+        // error check
+        if (c > 1.0) c = 1.0;
+        if (c < -1.0) c = -1.0;
+
+        // forces on each particle
+        double a = dudih;
+        c = c * a;
+        s12 = s12 * a;
+        a11 = c*sb1*s1;
+        a22 = -sb2 * (2.0*c0*s12 - c*(s1+s2));
+        a33 = c*sb3*s2;
+        a12 = -r12c1 * (c1mag*c*s1 + c2mag*s12);
+        a13 = -rb1*rb3*s12;
+        a23 = r12c2 * (c2mag*c*s2 + c1mag*s12);
+
+        sx2  = a12*vb1x + a22*vb2x + a23*vb3x;
+        sy2  = a12*vb1y + a22*vb2y + a23*vb3y;
+        sz2  = a12*vb1z + a22*vb2z + a23*vb3z;
+
+        f1[0] = a11*vb1x + a12*vb2x + a13*vb3x;
+        f1[1] = a11*vb1y + a12*vb2y + a13*vb3y;
+        f1[2] = a11*vb1z + a12*vb2z + a13*vb3z;
+
+        f2[0] = -sx2 - f1[0];
+        f2[1] = -sy2 - f1[1];
+        f2[2] = -sz2 - f1[2];
+
+        f4[0] = a13*vb1x + a23*vb2x + a33*vb3x;
+        f4[1] = a13*vb1y + a23*vb2y + a33*vb3y;
+        f4[2] = a13*vb1z + a23*vb2z + a33*vb3z;
+
+        f3[0] = sx2 - f4[0];
+        f3[1] = sy2 - f4[1];
+        f3[2] = sz2 - f4[2];
+
+        // only right bond crossing the plane
+        if (right_cross && !middle_cross && !left_cross)
+        {
+          double sgn = copysign(1.0, x_atom_1[dir] - pos);
+          df[0] = sgn * f1[0];
+          df[1] = sgn * f1[1];
+          df[2] = sgn * f1[2];
+        }
+
+        // only middle bond crossing the plane
+        if (!right_cross && middle_cross && !left_cross)
+        {
+          double sgn = copysign(1.0, x_atom_2[dir] - pos);
+          df[0] = sgn * (f2[0] + f1[0]);
+          df[1] = sgn * (f2[1] + f1[1]);
+          df[2] = sgn * (f2[2] + f1[2]);
+        }
+
+        // only left bond crossing the plane
+        if (!right_cross && !middle_cross && left_cross)
+        {
+          double sgn = copysign(1.0, x_atom_4[dir] - pos);
+          df[0] = sgn * f4[0];
+          df[1] = sgn * f4[1];
+          df[2] = sgn * f4[2];
+        }
+
+        // only right & middle bonds crossing the plane
+        if (right_cross && middle_cross && !left_cross)
+        {
+          double sgn = copysign(1.0, x_atom_2[dir] - pos);
+          df[0] = sgn * f2[0];
+          df[1] = sgn * f2[1];
+          df[2] = sgn * f2[2];
+        }
+
+        // only right & left bonds crossing the plane
+        if (right_cross && !middle_cross && left_cross)
+        {
+          double sgn = copysign(1.0, x_atom_1[dir] - pos);
+          df[0] = sgn * (f1[0] + f4[0]);
+          df[1] = sgn * (f1[1] + f4[1]);
+          df[2] = sgn * (f1[2] + f4[2]);
+        }
+
+        // only middle & left bonds crossing the plane
+        if (!right_cross && middle_cross && left_cross)
+        {
+          double sgn = copysign(1.0, x_atom_3[dir] - pos);
+          df[0] = sgn * f3[0];
+          df[1] = sgn * f3[1];
+          df[2] = sgn * f3[2];
+        }
+
+        // all three bonds crossing the plane
+        if (right_cross && middle_cross && left_cross)
+        {
+          double sgn = copysign(1.0, x_atom_1[dir] - pos);
+          df[0] = sgn * (f1[0] + f3[0]);
+          df[1] = sgn * (f1[1] + f3[1]);
+          df[2] = sgn * (f1[2] + f3[2]);
+        }
+
+        local_contribution[ibin][0] += df[0]/area*nktv2p;
+        local_contribution[ibin][1] += df[1]/area*nktv2p;
+        local_contribution[ibin][2] += df[2]/area*nktv2p;
+      }
+    }
+  }
+
+  // loop over the keywords and if necessary add the dihedral contribution
+  int m = 0;
+  while (m < nvalues) {
+    if ((which[m] == CONF) || (which[m] == TOTAL) || (which[m] == DIHEDRAL)) {
+      for (int ibin = 0; ibin < nbins; ibin++) {
+        dihedral_local[ibin][m] = local_contribution[ibin][0];
+        dihedral_local[ibin][m+1] = local_contribution[ibin][1];
+        dihedral_local[ibin][m+2] = local_contribution[ibin][2];
+      }
+    }
+    m += 3;
+  }
+
+}
+
 /* ----------------------------------------------------------------------
   setup 1d bins and their extent and coordinates
   called at init()
@ -621,47 +1197,39 @@ void ComputeStressMopProfile::setup_bins()
  boxlo = domain->boxlo;
  boxhi = domain->boxhi;

-  if (originflag == LOWER)
-    origin = boxlo[dir];
-  else if (originflag == UPPER)
-    origin = boxhi[dir];
-  else if (originflag == CENTER)
-    origin = 0.5 * (boxlo[dir] + boxhi[dir]);
+  if ((origin > domain->boxhi[dir]) || (origin < domain->boxlo[dir]))
+    error->all(FLERR, "Origin of bins for compute stress/mop/profile is out of bounds");

-  if (origin < boxlo[dir]) {
-    error->all(FLERR, "Origin of bins for compute stress/mop/profile is out of bounds");
-  } else {
-    n = static_cast<int>((origin - boxlo[dir]) * invdelta);
-    lo = origin - n * delta;
-  }
-  if (origin < boxhi[dir]) {
-    n = static_cast<int>((boxhi[dir] - origin) * invdelta);
-    hi = origin + n * delta;
-  } else {
-    error->all(FLERR, "Origin of bins for compute stress/mop/profile is out of bounds");
-  }
+  n = static_cast<int> ((origin - boxlo[dir]) * invdelta);
+  lo = origin - n*delta;
+
+  n = static_cast<int> ((boxhi[dir] - origin) * invdelta);
+  hi = origin + n*delta;

  offset = lo;
  nbins = static_cast<int>((hi - lo) * invdelta + 1.5);

-  // allocate bin arrays
-
-  memory->create(coord, nbins, 1, "stress/mop/profile:coord");
-  memory->create(coordp, nbins, 1, "stress/mop/profile:coordp");
+  //allocate bin arrays
+  memory->create(coord, nbins, "stress/mop/profile:coord");
+  memory->create(coordp, nbins, "stress/mop/profile:coordp");
  memory->create(values_local, nbins, nvalues, "stress/mop/profile:values_local");
  memory->create(values_global, nbins, nvalues, "stress/mop/profile:values_global");
  memory->create(bond_local, nbins, nvalues, "stress/mop/profile:bond_local");
  memory->create(bond_global, nbins, nvalues, "stress/mop/profile:bond_global");
+  memory->create(angle_local, nbins, nvalues, "stress/mop/profile:angle_local");
+  memory->create(angle_global, nbins, nvalues, "stress/mop/profile:angle_global");
+  memory->create(dihedral_local,nbins,nvalues,"stress/mop/profile:dihedral_local");
+  memory->create(dihedral_global,nbins,nvalues,"stress/mop/profile:dihedral_global");
  memory->create(local_contribution, nbins, 3, "stress/mop/profile:local_contribution");

  // set bin coordinates

  for (i = 0; i < nbins; i++) {
-    coord[i][0] = offset + i * delta;
-    if (coord[i][0] < (domain->boxlo[dir] + domain->prd_half[dir])) {
-      coordp[i][0] = coord[i][0] + domain->prd[dir];
+    coord[i] = offset + i * delta;
+    if (coord[i] < (domain->boxlo[dir] + domain->prd_half[dir])) {
+      coordp[i] = coord[i] + domain->prd[dir];
    } else {
-      coordp[i][0] = coord[i][0] - domain->prd[dir];
+      coordp[i] = coord[i] - domain->prd[dir];
    }
  }
 }
--- a/src/EXTRA-COMPUTE/compute_stress_mop_profile.h
+++ b/src/EXTRA-COMPUTE/compute_stress_mop_profile.h
@ -39,19 +39,22 @@ class ComputeStressMopProfile : public Compute {
 private:
  void compute_pairs();
  void compute_bonds();
+  void compute_angles();
+  void compute_dihedrals();
  void setup_bins();

  int nvalues, dir;
  int *which;

-  int bondflag;
+  int bondflag, angleflag, dihedralflag;

-  int originflag;
  double origin, delta, offset, invdelta;
  int nbins;
-  double **coord, **coordp;
+  double *coord, *coordp;
  double **values_local, **values_global;
  double **bond_local, **bond_global;
+  double **angle_local, **angle_global;
+  double **dihedral_local, **dihedral_global;
  double **local_contribution;

  double dt, nktv2p, ftm2v;
--- a/src/EXTRA-FIX/fix_ave_correlate_long.cpp
+++ b/src/EXTRA-FIX/fix_ave_correlate_long.cpp
@ -503,7 +503,7 @@ void FixAveCorrelateLong::end_of_step()
    if (overwrite) {
      bigint fileend = platform::ftell(fp);
      if ((fileend > 0) && (platform::ftruncate(fp,fileend)))
-        error->warning(FLERR,"Error while tuncating output: {}", utils::getsyserror());
+        error->warning(FLERR,"Error while truncating output: {}", utils::getsyserror());
    }
  }
 }
@ -728,7 +728,7 @@ double FixAveCorrelateLong::memory_usage() {
 void FixAveCorrelateLong::write_restart(FILE *fp) {
  if (comm->me == 0) {
    int nsize = 3*npair*numcorrelators*p + 2*npair*numcorrelators
-                + numcorrelators*p + 2*numcorrelators + 6;
+                + numcorrelators*p + 2*numcorrelators + 7;
    int n=0;
    double *list;
    memory->create(list,nsize,"correlator:list");
@ -736,6 +736,7 @@ void FixAveCorrelateLong::write_restart(FILE *fp) {
    list[n++] = numcorrelators;
    list[n++] = p;
    list[n++] = m;
+    list[n++] = kmax;
    list[n++] = last_accumulated_step;
    for (int i=0; i < npair; i++)
      for (int j=0; j < numcorrelators; j++) {
@ -771,6 +772,7 @@ void FixAveCorrelateLong::restart(char *buf)
  int numcorrelatorsin = static_cast<int> (list[n++]);
  int pin = static_cast<int>(list[n++]);
  int min = static_cast<int>(list[n++]);
+  kmax = static_cast<int>(list[n++]);
  last_accumulated_step = static_cast<int>(list[n++]);

  if ((npairin!=npair) || (numcorrelatorsin!=numcorrelators) || (pin!=(int)p) || (min!=(int)m))
--- a/src/EXTRA-FIX/fix_nonaffine_displacement.cpp
+++ b/src/EXTRA-FIX/fix_nonaffine_displacement.cpp
@ -71,6 +71,7 @@ FixNonaffineDisplacement::FixNonaffineDisplacement(LAMMPS *lmp, int narg, char *
  nevery = utils::inumeric(FLERR, arg[3], false, lmp);
  if (nevery <= 0) error->all(FLERR,"Illegal nevery value {} in fix nonaffine/displacement", nevery);

+  reference_timestep = update_timestep = offset_timestep = -1;
  int iarg = 4;
  if (strcmp(arg[iarg], "integrated") == 0) {
    nad_style = INTEGRATED;
@ -99,7 +100,7 @@ FixNonaffineDisplacement::FixNonaffineDisplacement(LAMMPS *lmp, int narg, char *
  if (strcmp(arg[iarg], "fixed") == 0) {
    reference_style = FIXED;
    reference_timestep = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
-    if (update_timestep < 0)
+    if (reference_timestep < 0)
      error->all(FLERR, "Illegal reference timestep {} in fix nonaffine/displacement", arg[iarg + 1]);
  } else if (strcmp(arg[iarg], "update") == 0) {
    reference_style = UPDATE;
--- a/src/EXTRA-FIX/fix_nonaffine_displacement.h
+++ b/src/EXTRA-FIX/fix_nonaffine_displacement.h
@ -32,7 +32,7 @@ class FixNonaffineDisplacement : public Fix {
  void post_constructor() override;
  void init() override;
  void init_list(int, class NeighList *) override;
-  void setup(int);
+  void setup(int) override;
  void post_force(int) override;
  void write_restart(FILE *fp) override;
  void restart(char *buf) override;
@ -62,7 +62,7 @@ class FixNonaffineDisplacement : public Fix {
  void calculate_D2Min();
  void save_reference_state();
  void minimum_image0(double *);
-  void grow_arrays(int);
+  void grow_arrays(int) override;
 };

 }    // namespace LAMMPS_NS
--- a/src/EXTRA-MOLECULE/angle_cosine_periodic.cpp
+++ b/src/EXTRA-MOLECULE/angle_cosine_periodic.cpp
@ -38,7 +38,10 @@ using namespace MathSpecial;

 /* ---------------------------------------------------------------------- */

-AngleCosinePeriodic::AngleCosinePeriodic(LAMMPS *lmp) : Angle(lmp) {}
+AngleCosinePeriodic::AngleCosinePeriodic(LAMMPS *lmp) : Angle(lmp)
+{
+  born_matrix_enable = 1;
+}

 /* ---------------------------------------------------------------------- */

@ -298,3 +301,38 @@ double AngleCosinePeriodic::single(int type, int i1, int i2, int i3)
  c = cos(acos(c)*multiplicity[type]);
  return 2.0*k[type]*(1.0-b[type]*powsign(multiplicity[type])*c);
 }
+
+/* ---------------------------------------------------------------------- */
+
+void AngleCosinePeriodic::born_matrix(int type, int i1, int i2, int i3, double &du, double &du2)
+{
+  double **x = atom->x;
+
+  double delx1 = x[i1][0] - x[i2][0];
+  double dely1 = x[i1][1] - x[i2][1];
+  double delz1 = x[i1][2] - x[i2][2];
+  domain->minimum_image(delx1,dely1,delz1);
+  double r1 = sqrt(delx1*delx1 + dely1*dely1 + delz1*delz1);
+
+  double delx2 = x[i3][0] - x[i2][0];
+  double dely2 = x[i3][1] - x[i2][1];
+  double delz2 = x[i3][2] - x[i2][2];
+  domain->minimum_image(delx2,dely2,delz2);
+  double r2 = sqrt(delx2*delx2 + dely2*dely2 + delz2*delz2);
+
+  double c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+  c /= r1*r2;
+  if (c > 1.0) c = 1.0;
+  if (c < -1.0) c = -1.0;
+  double theta = acos(c);
+
+  double s = sqrt(1.0 - c*c);
+  if (s < SMALL) s = SMALL;
+  s = 1.0/s;
+
+  double m_angle = multiplicity[type] * theta;
+  double prefactor = -2.0 * k[type] * b[type] * powsign(multiplicity[type]) * multiplicity[type];
+
+  du = prefactor * sin(m_angle) / s;
+  du2 = prefactor * (c * sin(m_angle) - s * cos(m_angle) * multiplicity[type]) / (s * s * s);
+}
--- a/src/EXTRA-MOLECULE/angle_cosine_periodic.h
+++ b/src/EXTRA-MOLECULE/angle_cosine_periodic.h
@ -35,6 +35,7 @@ class AngleCosinePeriodic : public Angle {
  void read_restart(FILE *) override;
  void write_data(FILE *) override;
  double single(int, int, int, int) override;
+  void born_matrix(int type, int i1, int i2, int i3, double &du, double &du2) override;

 protected:
  double *k;
--- a/src/EXTRA-MOLECULE/angle_fourier.cpp
+++ b/src/EXTRA-MOLECULE/angle_fourier.cpp
@ -39,6 +39,7 @@ using namespace MathConst;

 AngleFourier::AngleFourier(LAMMPS *lmp) : Angle(lmp)
 {
+  born_matrix_enable = 1;
  k = nullptr;
  C0 = nullptr;
  C1 = nullptr;
--- a/src/EXTRA-MOLECULE/angle_quartic.cpp
+++ b/src/EXTRA-MOLECULE/angle_quartic.cpp
@ -37,7 +37,10 @@ using namespace MathConst;

 /* ---------------------------------------------------------------------- */

-AngleQuartic::AngleQuartic(LAMMPS *lmp) : Angle(lmp) {}
+AngleQuartic::AngleQuartic(LAMMPS *lmp) : Angle(lmp)
+{
+  born_matrix_enable = 1;
+}

 /* ---------------------------------------------------------------------- */

@ -286,3 +289,39 @@ double AngleQuartic::single(int type, int i1, int i2, int i3)
  double dtheta4 = dtheta3 * dtheta;
  return k2[type] * dtheta2 + k3[type] * dtheta3 + k4[type] * dtheta4;
 }
+
+/* ---------------------------------------------------------------------- */
+
+void AngleQuartic::born_matrix(int type, int i1, int i2, int i3, double &du, double &du2)
+{
+  double **x = atom->x;
+
+  double delx1 = x[i1][0] - x[i2][0];
+  double dely1 = x[i1][1] - x[i2][1];
+  double delz1 = x[i1][2] - x[i2][2];
+  domain->minimum_image(delx1,dely1,delz1);
+  double r1 = sqrt(delx1*delx1 + dely1*dely1 + delz1*delz1);
+
+  double delx2 = x[i3][0] - x[i2][0];
+  double dely2 = x[i3][1] - x[i2][1];
+  double delz2 = x[i3][2] - x[i2][2];
+  domain->minimum_image(delx2,dely2,delz2);
+  double r2 = sqrt(delx2*delx2 + dely2*dely2 + delz2*delz2);
+
+  double c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+  c /= r1*r2;
+  if (c > 1.0) c = 1.0;
+  if (c < -1.0) c = -1.0;
+  double theta = acos(c);
+
+  double s = sqrt(1.0 - c*c);
+  if (s < SMALL) s = SMALL;
+
+  double dtheta = theta - theta0[type];
+  double dtheta2 = dtheta * dtheta;
+  double dtheta3 = dtheta2 * dtheta;
+
+  du = -(2.0 * k2[type] * dtheta + 3.0 * k3[type] * dtheta2 + 4.0 * k4[type] * dtheta3) / s;
+  du2 = (2.0 * k2[type] + 6.0 * k3[type] * dtheta + 12.0 * k4[type] * dtheta2) / (s*s) -
+          (2.0 * k2[type] * dtheta + 3.0 * k3[type] * dtheta2 + 4.0 * k4[type] * dtheta3) * c / (s*s*s);
+}
--- a/src/EXTRA-MOLECULE/angle_quartic.h
+++ b/src/EXTRA-MOLECULE/angle_quartic.h
@ -35,6 +35,7 @@ class AngleQuartic : public Angle {
  void read_restart(FILE *) override;
  void write_data(FILE *) override;
  double single(int, int, int, int) override;
+  void born_matrix(int type, int i1, int i2, int i3, double &du, double &du2) override;

 protected:
  double *k2, *k3, *k4, *theta0;
--- a/src/EXTRA-MOLECULE/bond_gaussian.cpp
+++ b/src/EXTRA-MOLECULE/bond_gaussian.cpp
@ -35,6 +35,7 @@ BondGaussian::BondGaussian(LAMMPS *lmp) :
    Bond(lmp), nterms(nullptr), bond_temperature(nullptr), alpha(nullptr), width(nullptr),
    r0(nullptr)
 {
+  born_matrix_enable = 1;
 }

 /* ---------------------------------------------------------------------- */
@ -294,3 +295,45 @@ double BondGaussian::single(int type, double rsq, int /*i*/, int /*j*/, double &

  return -(force->boltz * bond_temperature[type]) * log(sum_g_i);
 }
+
+/* ---------------------------------------------------------------------- */
+
+void BondGaussian::born_matrix(int type, double rsq, int /*i*/, int /*j*/, double &du, double &du2)
+{
+  double r = sqrt(rsq);
+
+  // first derivative of energy with respect to distance
+  double sum_g_i = 0.0;
+  double sum_numerator = 0.0;
+  for (int i = 0; i < nterms[type]; i++) {
+    double dr = r - r0[type][i];
+    double prefactor = (alpha[type][i] / (width[type][i] * sqrt(MY_PI2)));
+    double exponent = -2 * dr * dr / (width[type][i] * width[type][i]);
+    double g_i = prefactor * exp(exponent);
+    sum_g_i += g_i;
+    sum_numerator += g_i * dr / (width[type][i] * width[type][i]);
+  }
+
+  if (sum_g_i < SMALL) sum_g_i = SMALL;
+  du = 4.0 * (force->boltz * bond_temperature[type]) * (sum_numerator / sum_g_i);
+
+  // second derivative of energy with respect to distance
+  sum_g_i = 0.0;
+  double sum_dg_i = 0.0;
+  double sum_d2g_i = 0.0;
+  for (int i = 0; i < nterms[type]; i++) {
+    double dr = r - r0[type][i];
+    double prefactor = (alpha[type][i] / (width[type][i] * sqrt(MY_PI2)));
+    double exponent = -2 * dr * dr / (width[type][i] * width[type][i]);
+    double g_i = prefactor * exp(exponent);
+    sum_g_i += g_i;
+    sum_dg_i -= 4.0 * g_i * dr / pow(width[type][i], 2);
+    sum_d2g_i += 4.0 *  g_i * (4.0 * pow(r0[type][i], 2) - 8.0 * r0[type][i] * r - pow(width[type][i], 2) + 4.0 * r * r) / pow(width[type][i], 4) ;
+  }
+
+  if (sum_g_i < SMALL) sum_g_i = SMALL;
+  double numerator = sum_d2g_i*sum_g_i - sum_dg_i*sum_dg_i;
+  double denominator = sum_g_i * sum_g_i;
+
+  du2 = - (force->boltz * bond_temperature[type]) * numerator / denominator;
+}
--- a/src/EXTRA-MOLECULE/bond_gaussian.h
+++ b/src/EXTRA-MOLECULE/bond_gaussian.h
@ -35,6 +35,7 @@ class BondGaussian : public Bond {
  void read_restart(FILE *) override;
  void write_data(FILE *) override;
  double single(int, double, int, int, double &) override;
+  void born_matrix(int, double, int, int, double &, double &) override;

 protected:
  int *nterms;
--- a/src/EXTRA-MOLECULE/bond_harmonic_shift_cut.cpp
+++ b/src/EXTRA-MOLECULE/bond_harmonic_shift_cut.cpp
@ -31,7 +31,10 @@ using namespace LAMMPS_NS;

 /* ---------------------------------------------------------------------- */

-BondHarmonicShiftCut::BondHarmonicShiftCut(LAMMPS *lmp) : Bond(lmp) {}
+BondHarmonicShiftCut::BondHarmonicShiftCut(LAMMPS *lmp) : Bond(lmp)
+{
+  born_matrix_enable = 1;
+}

 /* ---------------------------------------------------------------------- */

@ -219,3 +222,19 @@ double BondHarmonicShiftCut::single(int type, double rsq, int /*i*/, int /*j*/,
  fforce = -2.0*k[type]*dr/r;
  return k[type]*(dr*dr - dr2*dr2);
 }
+
+/* ---------------------------------------------------------------------- */
+
+void BondHarmonicShiftCut::born_matrix(int type, double rsq, int /*i*/, int /*j*/, double &du, double &du2)
+{
+  du = 0.0;
+  du2 = 0.0;
+
+  double r = sqrt(rsq);
+  if (r>r1[type]) return;
+
+  double dr = r - r0[type];
+
+  du2 = 2 * k[type];
+  if (r > 0.0) du = du2 * dr;
+}
--- a/src/EXTRA-MOLECULE/bond_harmonic_shift_cut.h
+++ b/src/EXTRA-MOLECULE/bond_harmonic_shift_cut.h
@ -35,6 +35,7 @@ class BondHarmonicShiftCut : public Bond {
  void read_restart(FILE *) override;
  void write_data(FILE *) override;
  double single(int, double, int, int, double &) override;
+  void born_matrix(int, double, int, int, double &, double &) override;

 protected:
  double *k, *r0, *r1;
--- a/src/EXTRA-MOLECULE/dihedral_helix.cpp
+++ b/src/EXTRA-MOLECULE/dihedral_helix.cpp
@ -41,6 +41,7 @@ using namespace MathConst;
 DihedralHelix::DihedralHelix(LAMMPS *lmp) : Dihedral(lmp)
 {
  writedata = 1;
+  born_matrix_enable = 1;
 }

 /* ---------------------------------------------------------------------- */
@ -324,3 +325,108 @@ void DihedralHelix::write_data(FILE *fp)
  for (int i = 1; i <= atom->ndihedraltypes; i++)
    fprintf(fp,"%d %g %g %g\n",i,aphi[i],bphi[i],cphi[i]);
 }
+
+/* ----------------------------------------------------------------------*/
+
+void DihedralHelix::born_matrix(int nd, int i1, int i2, int i3, int i4,
+                                             double &du, double &du2)
+{
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double sb1,sb3,rb1,rb3,c0,b1mag2,b1mag,b2mag2;
+  double b2mag,b3mag2,b3mag,ctmp,r12c1,c1mag,r12c2;
+  double c2mag,sc1,sc2,s12,c;
+  double cx,cy,cz,cmag,dx,phi,si,siinv,sin2;
+
+  int **dihedrallist = neighbor->dihedrallist;
+  double **x = atom->x;
+
+  int type = dihedrallist[nd][4];
+
+  // 1st bond
+
+  vb1x = x[i1][0] - x[i2][0];
+  vb1y = x[i1][1] - x[i2][1];
+  vb1z = x[i1][2] - x[i2][2];
+
+  // 2nd bond
+
+  vb2x = x[i3][0] - x[i2][0];
+  vb2y = x[i3][1] - x[i2][1];
+  vb2z = x[i3][2] - x[i2][2];
+
+  vb2xm = -vb2x;
+  vb2ym = -vb2y;
+  vb2zm = -vb2z;
+
+  // 3rd bond
+
+  vb3x = x[i4][0] - x[i3][0];
+  vb3y = x[i4][1] - x[i3][1];
+  vb3z = x[i4][2] - x[i3][2];
+
+  // c0 calculation
+
+  sb1 = 1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
+  sb3 = 1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
+
+  rb1 = sqrt(sb1);
+  rb3 = sqrt(sb3);
+
+  c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
+
+  // 1st and 2nd angle
+
+  b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
+  b1mag = sqrt(b1mag2);
+  b2mag2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
+  b2mag = sqrt(b2mag2);
+  b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
+  b3mag = sqrt(b3mag2);
+
+  ctmp = vb1x*vb2x + vb1y*vb2y + vb1z*vb2z;
+  r12c1 = 1.0 / (b1mag*b2mag);
+  c1mag = ctmp * r12c1;
+
+  ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
+  r12c2 = 1.0 / (b2mag*b3mag);
+  c2mag = ctmp * r12c2;
+
+  // cos and sin of 2 angles and final c
+
+  sin2 = MAX(1.0 - c1mag*c1mag,0.0);
+  sc1 = sqrt(sin2);
+  if (sc1 < SMALL) sc1 = SMALL;
+  sc1 = 1.0/sc1;
+
+  sin2 = MAX(1.0 - c2mag*c2mag,0.0);
+  sc2 = sqrt(sin2);
+  if (sc2 < SMALL) sc2 = SMALL;
+  sc2 = 1.0/sc2;
+
+  s12 = sc1 * sc2;
+  c = (c0 + c1mag*c2mag) * s12;
+
+  cx = vb1y*vb2z - vb1z*vb2y;
+  cy = vb1z*vb2x - vb1x*vb2z;
+  cz = vb1x*vb2y - vb1y*vb2x;
+  cmag = sqrt(cx*cx + cy*cy + cz*cz);
+  dx = (cx*vb3x + cy*vb3y + cz*vb3z)/cmag/b3mag;
+
+  // error check
+
+  if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) problem(FLERR, i1, i2, i3, i4);
+
+  if (c > 1.0) c = 1.0;
+  if (c < -1.0) c = -1.0;
+
+  phi = acos(c);
+  if (dx > 0.0) phi *= -1.0;
+  si = sin(phi);
+  if (fabs(si) < SMALLER) si = SMALLER;
+  siinv = 1.0/si;
+
+  du = -aphi[type] + 3.0*bphi[type]*sin(3.0*phi)*siinv +
+                                            cphi[type]*sin(phi + MY_PI4)*siinv;
+  du2 = -(9.0*bphi[type]*cos(3.0*phi) + cphi[type]*cos(phi + MY_PI4))*siinv*siinv +
+          (3.0*bphi[type]*sin(3.0*phi) + cphi[type]*sin(phi + MY_PI4))*c*siinv*siinv*siinv;
+}
--- a/src/EXTRA-MOLECULE/dihedral_helix.h
+++ b/src/EXTRA-MOLECULE/dihedral_helix.h
@ -33,6 +33,7 @@ class DihedralHelix : public Dihedral {
  void write_restart(FILE *) override;
  void read_restart(FILE *) override;
  void write_data(FILE *) override;
+  void born_matrix(int, int, int, int, int, double &, double &) override;

 protected:
  double *aphi, *bphi, *cphi;
--- a/src/EXTRA-MOLECULE/dihedral_quadratic.cpp
+++ b/src/EXTRA-MOLECULE/dihedral_quadratic.cpp
@ -41,6 +41,7 @@ using namespace MathConst;
 DihedralQuadratic::DihedralQuadratic(LAMMPS *lmp) : Dihedral(lmp)
 {
  writedata = 1;
+  born_matrix_enable = 1;
 }

 /* ---------------------------------------------------------------------- */
@ -327,3 +328,112 @@ void DihedralQuadratic::write_data(FILE *fp)
  for (int i = 1; i <= atom->ndihedraltypes; i++)
    fprintf(fp,"%d %g %g \n",i,k[i],phi0[i]*180.0/MY_PI);
 }
+
+/* ----------------------------------------------------------------------*/
+
+void DihedralQuadratic::born_matrix(int nd, int i1, int i2, int i3, int i4,
+                             double &du, double &du2)
+{
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double sb1,sb3,rb1,rb3,c0,b1mag2,b1mag,b2mag2;
+  double b2mag,b3mag2,b3mag,ctmp,r12c1,c1mag,r12c2;
+  double c2mag,sc1,sc2,s12,c;
+  double s1,s2,cx,cy,cz,cmag,dx,phi,si,siinv,sin2;
+
+  int **dihedrallist = neighbor->dihedrallist;
+  double **x = atom->x;
+
+  int type = dihedrallist[nd][4];
+
+  // 1st bond
+
+  vb1x = x[i1][0] - x[i2][0];
+  vb1y = x[i1][1] - x[i2][1];
+  vb1z = x[i1][2] - x[i2][2];
+
+  // 2nd bond
+
+  vb2x = x[i3][0] - x[i2][0];
+  vb2y = x[i3][1] - x[i2][1];
+  vb2z = x[i3][2] - x[i2][2];
+
+  vb2xm = -vb2x;
+  vb2ym = -vb2y;
+  vb2zm = -vb2z;
+
+  // 3rd bond
+  vb3x = x[i4][0] - x[i3][0];
+  vb3y = x[i4][1] - x[i3][1];
+  vb3z = x[i4][2] - x[i3][2];
+
+  // c0 calculation
+
+  sb1 = 1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
+  sb3 = 1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
+
+  rb1 = sqrt(sb1);
+  rb3 = sqrt(sb3);
+
+  c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
+
+  // 1st and 2nd angle
+
+  b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
+  b1mag = sqrt(b1mag2);
+  b2mag2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
+  b2mag = sqrt(b2mag2);
+  b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
+  b3mag = sqrt(b3mag2);
+
+  ctmp = vb1x*vb2x + vb1y*vb2y + vb1z*vb2z;
+  r12c1 = 1.0 / (b1mag*b2mag);
+  c1mag = ctmp * r12c1;
+
+  ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
+  r12c2 = 1.0 / (b2mag*b3mag);
+  c2mag = ctmp * r12c2;
+
+  // cos and sin of 2 angles and final c
+
+  sin2 = MAX(1.0 - c1mag*c1mag,0.0);
+  sc1 = sqrt(sin2);
+  if (sc1 < SMALL) sc1 = SMALL;
+  sc1 = 1.0/sc1;
+
+  sin2 = MAX(1.0 - c2mag*c2mag,0.0);
+  sc2 = sqrt(sin2);
+  if (sc2 < SMALL) sc2 = SMALL;
+  sc2 = 1.0/sc2;
+
+  s1 = sc1 * sc1;
+  s2 = sc2 * sc2;
+  s12 = sc1 * sc2;
+  c = (c0 + c1mag*c2mag) * s12;
+
+  cx = vb1y*vb2z - vb1z*vb2y;
+  cy = vb1z*vb2x - vb1x*vb2z;
+  cz = vb1x*vb2y - vb1y*vb2x;
+  cmag = sqrt(cx*cx + cy*cy + cz*cz);
+  dx = (cx*vb3x + cy*vb3y + cz*vb3z)/cmag/b3mag;
+
+  // error check
+
+  if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE))
+                  problem(FLERR, i1, i2, i3, i4);
+
+  if (c > 1.0) c = 1.0;
+  if (c < -1.0) c = -1.0;
+
+  phi = acos(c);
+  if (dx > 0.0) phi *= -1.0;
+  si = sin(phi);
+  if (fabs(si) < SMALLER) si = SMALLER;
+  siinv = 1.0/si;
+
+  double dphi = phi-phi0[type];
+  if (dphi > MY_PI) dphi -= 2*MY_PI;
+  else if (dphi < -MY_PI) dphi += 2*MY_PI;
+
+  du = - 2.0 * k[type] * dphi * siinv;
+  du2 = 2.0 * k[type] * siinv * siinv * ( 1.0 - dphi * c * siinv) ;
+}
--- a/src/EXTRA-MOLECULE/dihedral_quadratic.h
+++ b/src/EXTRA-MOLECULE/dihedral_quadratic.h
@ -33,6 +33,7 @@ class DihedralQuadratic : public Dihedral {
  void write_restart(FILE *) override;
  void read_restart(FILE *) override;
  void write_data(FILE *) override;
+  void born_matrix(int, int, int, int, int, double &, double &) override;

 protected:
  double *k, *phi0;
--- a/src/GPU/pppm_gpu.cpp
+++ b/src/GPU/pppm_gpu.cpp
@ -405,7 +405,8 @@ void PPPMGPU::poisson_ik()

  // if requested, compute energy and virial contribution

-  double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
+  bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm;
+  double scaleinv = 1.0 / ngridtotal;
  double s2 = scaleinv*scaleinv;

  if (eflag_global || vflag_global) {
--- a/src/INTEL/fix_intel.cpp
+++ b/src/INTEL/fix_intel.cpp
@ -553,6 +553,9 @@ void FixIntel::kspace_init_check()

  if (intel_pair == 0)
    error->all(FLERR,"Intel styles for kspace require intel pair style.");
+
+  if (utils::strmatch(update->integrate_style, "^verlet/split"))
+    error->all(FLERR,"Intel styles for kspace are not compatible with run_style verlet/split");
 }

 /* ---------------------------------------------------------------------- */
--- a/src/INTEL/npair_full_bin_ghost_intel.cpp
+++ b/src/INTEL/npair_full_bin_ghost_intel.cpp
@ -16,7 +16,7 @@
   Contributing authors: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */

-#include "npair_full_bin_ghost_intel.h"
+#include "npair_bin_ghost_intel.h"

 #include "atom.h"
 #include "comm.h"
--- a/src/INTEL/npair_full_bin_ghost_intel.h
+++ b/src/INTEL/npair_full_bin_ghost_intel.h
@ -25,8 +25,8 @@ NPairStyle(full/bin/ghost/intel,
 // clang-format on
 #else

-#ifndef LMP_NPAIR_FULL_BIN_GHOST_INTEL_H
-#define LMP_NPAIR_FULL_BIN_GHOST_INTEL_H
+#ifndef LMP_NPAIR_BIN_GHOST_INTEL_H
+#define LMP_NPAIR_BIN_GHOST_INTEL_H

 #include "npair_intel.h"

--- a/src/INTEL/npair_bin_intel.cpp
+++ b/src/INTEL/npair_bin_intel.cpp
@ -0,0 +1,298 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "npair_bin_intel.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "neigh_list.h"
+#include "neighbor.h"
+
+using namespace LAMMPS_NS;
+
+
+/* ---------------------------------------------------------------------- */
+
+NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) :
+  NPairIntel(lmp) {}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with full Newton's 3rd law
+   each owned atom i checks its own bin and other bins in Newton stencil
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+void NPairHalfBinNewtonIntel::build(NeighList *list)
+{
+  if (nstencil / 2 > INTEL_MAX_STENCIL_CHECK)
+    error->all(FLERR, "Too many neighbor bins for INTEL package.");
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (exclude)
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+
+  if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
+    hbni(list, _fix->get_mixed_buffers());
+  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    hbni(list, _fix->get_double_buffers());
+  else
+    hbni(list, _fix->get_single_buffers());
+
+  _fix->stop_watch(TIME_HOST_NEIGHBOR);
+}
+
+template <class flt_t, class acc_t>
+void NPairHalfBinNewtonIntel::
+hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  list->inum = nlocal;
+
+  int host_start = _fix->host_start_neighbor();
+  const int off_end = _fix->offload_end_neighbor();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (off_end) grow_stencil();
+  if (_fix->full_host_list()) host_start = 0;
+  int offload_noghost = _fix->offload_noghost();
+  #endif
+
+  buffers->grow_list(list, atom->nlocal, comm->nthreads, 0, off_end);
+
+  int need_ic = 0;
+  if (atom->molecular != Atom::ATOMIC)
+    dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
+                         neighbor->cutneighmax);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (need_ic) {
+    if (offload_noghost) {
+      bin_newton<flt_t,acc_t,1,1,0,0,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
+    } else {
+      bin_newton<flt_t,acc_t,0,1,0,0,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
+    }
+  } else {
+    if (offload_noghost) {
+      bin_newton<flt_t,acc_t,1,0,0,0,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
+    } else {
+      bin_newton<flt_t,acc_t,0,0,0,0,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
+    }
+  }
+  #else
+  if (need_ic)
+    bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
+  else
+    bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
+  #endif
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) :
+  NPairIntel(lmp) {}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with Newton's 3rd law for triclinic
+   each owned atom i checks its own bin and other bins in triclinic stencil
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+void NPairHalfBinNewtonTriIntel::build(NeighList *list)
+{
+  if (nstencil > INTEL_MAX_STENCIL)
+    error->all(FLERR, "Too many neighbor bins for INTEL package.");
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (exclude)
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+
+  if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
+    hbnti(list, _fix->get_mixed_buffers());
+  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    hbnti(list, _fix->get_double_buffers());
+  else
+    hbnti(list, _fix->get_single_buffers());
+
+  _fix->stop_watch(TIME_HOST_NEIGHBOR);
+}
+
+template <class flt_t, class acc_t>
+void NPairHalfBinNewtonTriIntel::
+hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  list->inum = nlocal;
+
+  int host_start = _fix->host_start_neighbor();
+  const int off_end = _fix->offload_end_neighbor();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (off_end) grow_stencil();
+  if (_fix->full_host_list()) host_start = 0;
+  int offload_noghost = _fix->offload_noghost();
+  #endif
+
+  buffers->grow_list(list, atom->nlocal, comm->nthreads, 0, off_end);
+
+  int need_ic = 0;
+  if (atom->molecular != Atom::ATOMIC)
+    dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
+                         neighbor->cutneighmax);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (need_ic) {
+    if (offload_noghost) {
+      bin_newton<flt_t,acc_t,1,1,0,1,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
+    } else {
+      bin_newton<flt_t,acc_t,0,1,0,1,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
+    }
+  } else {
+    if (offload_noghost) {
+      bin_newton<flt_t,acc_t,1,0,0,1,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
+    } else {
+      bin_newton<flt_t,acc_t,0,0,0,1,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
+    }
+  }
+  #else
+  if (need_ic)
+    bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
+  else
+    bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+NPairFullBinIntel::NPairFullBinIntel(LAMMPS *lmp) : NPairIntel(lmp) {}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction for all neighbors
+   every neighbor pair appears in list of both atoms i and j
+------------------------------------------------------------------------- */
+
+void NPairFullBinIntel::build(NeighList *list)
+{
+  if (nstencil > INTEL_MAX_STENCIL_CHECK)
+    error->all(FLERR, "Too many neighbor bins for INTEL package.");
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (exclude)
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+
+  if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
+    fbi(list, _fix->get_mixed_buffers());
+  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    fbi(list, _fix->get_double_buffers());
+  else
+    fbi(list, _fix->get_single_buffers());
+
+  _fix->stop_watch(TIME_HOST_NEIGHBOR);
+}
+
+template <class flt_t, class acc_t>
+void NPairFullBinIntel::
+fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  list->inum = nlocal;
+  list->gnum = 0;
+
+  int host_start = _fix->host_start_neighbor();;
+  const int off_end = _fix->offload_end_neighbor();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (off_end) grow_stencil();
+  if (_fix->full_host_list()) host_start = 0;
+  int offload_noghost = _fix->offload_noghost();
+  #endif
+
+  buffers->grow_list(list, atom->nlocal, comm->nthreads,
+                     _fix->three_body_neighbor(), off_end,
+                     _fix->nbor_pack_width());
+
+  int need_ic = 0;
+  if (atom->molecular != Atom::ATOMIC)
+    dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
+                         neighbor->cutneighmax);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_fix->three_body_neighbor()) {
+    if (need_ic) {
+      if (offload_noghost) {
+        bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
+      } else {
+        bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
+      }
+    } else {
+      if (offload_noghost) {
+        bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
+      } else {
+        bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
+      }
+    }
+  } else {
+    if (need_ic) {
+      if (offload_noghost) {
+        bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
+      } else {
+        bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
+      }
+    } else {
+      if (offload_noghost) {
+        bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
+      } else {
+        bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
+      }
+    }
+  }
+  #else
+  if (_fix->three_body_neighbor()) {
+    if (need_ic)
+      bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
+    else
+      bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
+  } else {
+    if (need_ic)
+      bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
+    else
+      bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
+  }
+  #endif
+}
--- a/src/INTEL/npair_half_bin_newton_tri_intel.h
+++ b/src/INTEL/npair_half_bin_newton_tri_intel.h
@ -14,20 +14,38 @@

 #ifdef NPAIR_CLASS
 // clang-format off
+NPairStyle(half/bin/newton/intel,
+           NPairHalfBinNewtonIntel,
+           NP_HALF | NP_BIN | NP_NEWTON | NP_ORTHO | NP_INTEL);
+
 NPairStyle(half/bin/newton/tri/intel,
           NPairHalfBinNewtonTriIntel,
           NP_HALF | NP_BIN | NP_NEWTON | NP_TRI | NP_INTEL);
+
+NPairStyle(full/bin/intel,
+           NPairFullBinIntel,
+           NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI |
+           NP_INTEL);
 // clang-format on
 #else

-#ifndef LMP_NPAIR_HALF_BIN_NEWTON_INTEL_TRI_H
-#define LMP_NPAIR_HALF_BIN_NEWTON_INTEL_TRI_H
+#ifndef LMP_NPAIR_BIN_INTEL_H
+#define LMP_NPAIR_BIN_INTEL_H

 #include "fix_intel.h"
 #include "npair_intel.h"

 namespace LAMMPS_NS {

+class NPairHalfBinNewtonIntel : public NPairIntel {
+ public:
+  NPairHalfBinNewtonIntel(class LAMMPS *);
+  void build(class NeighList *) override;
+
+ private:
+  template <class flt_t, class acc_t> void hbni(NeighList *, IntelBuffers<flt_t, acc_t> *);
+};
+
 class NPairHalfBinNewtonTriIntel : public NPairIntel {
 public:
  NPairHalfBinNewtonTriIntel(class LAMMPS *);
@ -37,6 +55,15 @@ class NPairHalfBinNewtonTriIntel : public NPairIntel {
  template <class flt_t, class acc_t> void hbnti(NeighList *, IntelBuffers<flt_t, acc_t> *);
 };

+class NPairFullBinIntel : public NPairIntel {
+ public:
+  NPairFullBinIntel(class LAMMPS *);
+  void build(class NeighList *) override;
+
+ private:
+  template <class flt_t, class acc_t> void fbi(NeighList *, IntelBuffers<flt_t, acc_t> *);
+};
+
 }    // namespace LAMMPS_NS

 #endif
--- a/src/INTEL/npair_full_bin_intel.h
+++ b/src/INTEL/npair_full_bin_intel.h
@ -1,44 +0,0 @@
-// clang-format off
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   LAMMPS development team: developers@lammps.org
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-#ifdef NPAIR_CLASS
-// clang-format off
-NPairStyle(full/bin/intel,
-           NPairFullBinIntel,
-           NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI |
-           NP_INTEL);
-// clang-format on
-#else
-
-#ifndef LMP_NPAIR_FULL_BIN_INTEL_H
-#define LMP_NPAIR_FULL_BIN_INTEL_H
-
-#include "fix_intel.h"
-#include "npair_intel.h"
-
-namespace LAMMPS_NS {
-
-class NPairFullBinIntel : public NPairIntel {
- public:
-  NPairFullBinIntel(class LAMMPS *);
-  void build(class NeighList *) override;
-
- private:
-  template <class flt_t, class acc_t> void fbi(NeighList *, IntelBuffers<flt_t, acc_t> *);
-};
-
-}    // namespace LAMMPS_NS
-
-#endif
-#endif
--- a/src/INTEL/npair_half_bin_newton_intel.cpp
+++ b/src/INTEL/npair_half_bin_newton_intel.cpp
@ -1,108 +0,0 @@
-// clang-format off
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   LAMMPS development team: developers@lammps.org
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
-
-#include "npair_half_bin_newton_intel.h"
-
-#include "atom.h"
-#include "comm.h"
-#include "error.h"
-#include "neigh_list.h"
-#include "neighbor.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) :
-  NPairIntel(lmp) {}
-
-/* ----------------------------------------------------------------------
-   binned neighbor list construction with full Newton's 3rd law
-   each owned atom i checks its own bin and other bins in Newton stencil
-   every pair stored exactly once by some processor
------------------------------------------------------------------------- */
-
-void NPairHalfBinNewtonIntel::build(NeighList *list)
-{
-  if (nstencil / 2 > INTEL_MAX_STENCIL_CHECK)
-    error->all(FLERR, "Too many neighbor bins for INTEL package.");
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (exclude)
-    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
-  #endif
-
-  if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
-    hbni(list, _fix->get_mixed_buffers());
-  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
-    hbni(list, _fix->get_double_buffers());
-  else
-    hbni(list, _fix->get_single_buffers());
-
-  _fix->stop_watch(TIME_HOST_NEIGHBOR);
-}
-
-template <class flt_t, class acc_t>
-void NPairHalfBinNewtonIntel::
-hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
-  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
-  list->inum = nlocal;
-
-  int host_start = _fix->host_start_neighbor();
-  const int off_end = _fix->offload_end_neighbor();
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (off_end) grow_stencil();
-  if (_fix->full_host_list()) host_start = 0;
-  int offload_noghost = _fix->offload_noghost();
-  #endif
-
-  buffers->grow_list(list, atom->nlocal, comm->nthreads, 0, off_end);
-
-  int need_ic = 0;
-  if (atom->molecular != Atom::ATOMIC)
-    dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
-                         neighbor->cutneighmax);
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (need_ic) {
-    if (offload_noghost) {
-      bin_newton<flt_t,acc_t,1,1,0,0,0>(1, list, buffers, 0, off_end);
-      bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal,
-                                        off_end);
-    } else {
-      bin_newton<flt_t,acc_t,0,1,0,0,0>(1, list, buffers, 0, off_end);
-      bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
-    }
-  } else {
-    if (offload_noghost) {
-      bin_newton<flt_t,acc_t,1,0,0,0,0>(1, list, buffers, 0, off_end);
-      bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal,
-                                        off_end);
-    } else {
-      bin_newton<flt_t,acc_t,0,0,0,0,0>(1, list, buffers, 0, off_end);
-      bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
-    }
-  }
-  #else
-  if (need_ic)
-    bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
-  else
-    bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
-  #endif
-}
--- a/src/INTEL/npair_half_bin_newton_intel.h
+++ b/src/INTEL/npair_half_bin_newton_intel.h
@ -1,43 +0,0 @@
-// clang-format off
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   LAMMPS development team: developers@lammps.org
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-#ifdef NPAIR_CLASS
-// clang-format off
-NPairStyle(half/bin/newton/intel,
-           NPairHalfBinNewtonIntel,
-           NP_HALF | NP_BIN | NP_NEWTON | NP_ORTHO | NP_INTEL);
-// clang-format on
-#else
-
-#ifndef LMP_NPAIR_HALF_BIN_NEWTON_INTEL_H
-#define LMP_NPAIR_HALF_BIN_NEWTON_INTEL_H
-
-#include "fix_intel.h"
-#include "npair_intel.h"
-
-namespace LAMMPS_NS {
-
-class NPairHalfBinNewtonIntel : public NPairIntel {
- public:
-  NPairHalfBinNewtonIntel(class LAMMPS *);
-  void build(class NeighList *) override;
-
- private:
-  template <class flt_t, class acc_t> void hbni(NeighList *, IntelBuffers<flt_t, acc_t> *);
-};
-
-}    // namespace LAMMPS_NS
-
-#endif
-#endif
--- a/src/INTEL/npair_half_bin_newton_tri_intel.cpp
+++ b/src/INTEL/npair_half_bin_newton_tri_intel.cpp
@ -1,108 +0,0 @@
-// clang-format off
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   LAMMPS development team: developers@lammps.org
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
-
-#include "npair_half_bin_newton_tri_intel.h"
-
-#include "atom.h"
-#include "comm.h"
-#include "error.h"
-#include "neigh_list.h"
-#include "neighbor.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) :
-  NPairIntel(lmp) {}
-
-/* ----------------------------------------------------------------------
-   binned neighbor list construction with Newton's 3rd law for triclinic
-   each owned atom i checks its own bin and other bins in triclinic stencil
-   every pair stored exactly once by some processor
------------------------------------------------------------------------- */
-
-void NPairHalfBinNewtonTriIntel::build(NeighList *list)
-{
-  if (nstencil > INTEL_MAX_STENCIL)
-    error->all(FLERR, "Too many neighbor bins for INTEL package.");
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (exclude)
-    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
-  #endif
-
-  if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
-    hbnti(list, _fix->get_mixed_buffers());
-  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
-    hbnti(list, _fix->get_double_buffers());
-  else
-    hbnti(list, _fix->get_single_buffers());
-
-  _fix->stop_watch(TIME_HOST_NEIGHBOR);
-}
-
-template <class flt_t, class acc_t>
-void NPairHalfBinNewtonTriIntel::
-hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
-  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
-  list->inum = nlocal;
-
-  int host_start = _fix->host_start_neighbor();
-  const int off_end = _fix->offload_end_neighbor();
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (off_end) grow_stencil();
-  if (_fix->full_host_list()) host_start = 0;
-  int offload_noghost = _fix->offload_noghost();
-  #endif
-
-  buffers->grow_list(list, atom->nlocal, comm->nthreads, 0, off_end);
-
-  int need_ic = 0;
-  if (atom->molecular != Atom::ATOMIC)
-    dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
-                         neighbor->cutneighmax);
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (need_ic) {
-    if (offload_noghost) {
-      bin_newton<flt_t,acc_t,1,1,0,1,0>(1, list, buffers, 0, off_end);
-      bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal,
-                                        off_end);
-    } else {
-      bin_newton<flt_t,acc_t,0,1,0,1,0>(1, list, buffers, 0, off_end);
-      bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
-    }
-  } else {
-    if (offload_noghost) {
-      bin_newton<flt_t,acc_t,1,0,0,1,0>(1, list, buffers, 0, off_end);
-      bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal,
-                                        off_end);
-    } else {
-      bin_newton<flt_t,acc_t,0,0,0,1,0>(1, list, buffers, 0, off_end);
-      bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
-    }
-  }
-  #else
-  if (need_ic)
-    bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
-  else
-    bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
-  #endif
-}
--- a/src/INTEL/npair_halffull_trim_newton_intel.cpp
+++ b/src/INTEL/npair_halffull_trim_newton_intel.cpp
@ -13,10 +13,10 @@
 ------------------------------------------------------------------------- */

 /* ----------------------------------------------------------------------
-   Contributing author: Stan Moore (SNL)
+   Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */

-#include "npair_halffull_trim_newton_intel.h"
+#include "npair_halffull_intel.h"

 #include "atom.h"
 #include "comm.h"
@ -31,6 +31,232 @@ using namespace LAMMPS_NS;

 /* ---------------------------------------------------------------------- */

+NPairHalffullNewtonIntel::NPairHalffullNewtonIntel(LAMMPS *lmp) : NPair(lmp) {
+  _fix = static_cast<FixIntel *>(modify->get_fix_by_id("package_intel"));
+  if (!_fix) error->all(FLERR, "The 'package intel' command is required for /intel styles");
+}
+
+/* ----------------------------------------------------------------------
+   build half list from full list
+   pair stored once if i,j are both owned and i < j
+   if j is ghost, only store if j coords are "above and to the right" of i
+   works if full list is a skip list
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void NPairHalffullNewtonIntel::build_t(NeighList *list,
+                                       IntelBuffers<flt_t,acc_t> *buffers)
+{
+  const int inum_full = list->listfull->inum;
+  const int nlocal = atom->nlocal;
+  const int e_nall = nlocal + atom->nghost;
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const ilist = list->ilist;
+  int * _noalias const numneigh = list->numneigh;
+  int ** _noalias const firstneigh = list->firstneigh;
+  const int * _noalias const ilist_full = list->listfull->ilist;
+  const int * _noalias const numneigh_full = list->listfull->numneigh;
+  const int ** _noalias const firstneigh_full = (const int ** const)list->listfull->firstneigh;  // NOLINT
+
+  const double delta = 0.01 * force->angstrom;
+  const int triclinic = domain->triclinic;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  #endif
+  {
+    int tid, ifrom, ito;
+    IP_PRE_omp_range_id(ifrom, ito, tid, inum_full, comm->nthreads);
+
+    // each thread has its own page allocator
+    MyPage<int> &ipage = list->ipage[tid];
+    ipage.reset();
+
+    // loop over parent full list
+    for (int ii = ifrom; ii < ito; ii++) {
+      int n = 0;
+      int *neighptr = ipage.vget();
+
+      const int i = ilist_full[ii];
+      const flt_t xtmp = x[i].x;
+      const flt_t ytmp = x[i].y;
+      const flt_t ztmp = x[i].z;
+
+      // loop over full neighbor list
+
+      const int * _noalias const jlist = firstneigh_full[i];
+      const int jnum = numneigh_full[i];
+
+      if (!triclinic) {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          const int joriginal = jlist[jj];
+          const int j = joriginal & NEIGHMASK;
+          int addme = 1;
+          if (j < nlocal) {
+            if (i > j) addme = 0;
+          } else {
+            if (x[j].z < ztmp) addme = 0;
+            if (x[j].z == ztmp) {
+              if (x[j].y < ytmp) addme = 0;
+              if (x[j].y == ytmp && x[j].x < xtmp) addme = 0;
+            }
+          }
+          if (addme)
+            neighptr[n++] = joriginal;
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          const int joriginal = jlist[jj];
+          const int j = joriginal & NEIGHMASK;
+          int addme = 1;
+          if (j < nlocal) {
+            if (i > j) addme = 0;
+          } else {
+            if (fabs(x[j].z-ztmp) > delta) {
+              if (x[j].z < ztmp) addme = 0;
+            } else if (fabs(x[j].y-ytmp) > delta) {
+              if (x[j].y < ytmp) addme = 0;
+            } else {
+              if (x[j].x < xtmp) addme = 0;
+            }
+          }
+          if (addme)
+            neighptr[n++] = joriginal;
+        }
+      }
+
+      ilist[ii] = i;
+      firstneigh[i] = neighptr;
+      numneigh[i] = n;
+
+      int pad_end = n;
+      IP_PRE_neighbor_pad(pad_end, 0);
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
+              avg=INTEL_COMPILE_WIDTH/2
+      #endif
+      for ( ; n < pad_end; n++)
+        neighptr[n] = e_nall;
+
+      ipage.vgot(n);
+      if (ipage.status())
+        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+    }
+  }
+  list->inum = inum_full;
+}
+
+/* ----------------------------------------------------------------------
+   build half list from full 3-body list
+   half list is already stored as first part of 3-body list
+------------------------------------------------------------------------- */
+
+template <class flt_t>
+void NPairHalffullNewtonIntel::build_t3(NeighList *list, int *numhalf)
+{
+  const int inum_full = list->listfull->inum;
+  const int e_nall = atom->nlocal + atom->nghost;
+  int * _noalias const ilist = list->ilist;
+  int * _noalias const numneigh = list->numneigh;
+  int ** _noalias const firstneigh = list->firstneigh;
+  const int * _noalias const ilist_full = list->listfull->ilist;
+  const int * _noalias const numneigh_full = numhalf;
+  const int ** _noalias const firstneigh_full = (const int ** const)list->listfull->firstneigh;  // NOLINT
+
+  int packthreads = 1;
+  if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel if (packthreads > 1)
+  #endif
+  {
+    int tid, ifrom, ito;
+    IP_PRE_omp_range_id(ifrom, ito, tid, inum_full, packthreads);
+
+    // each thread has its own page allocator
+    MyPage<int> &ipage = list->ipage[tid];
+    ipage.reset();
+
+    // loop over parent full list
+    for (int ii = ifrom; ii < ito; ii++) {
+      int n = 0;
+      int *neighptr = ipage.vget();
+
+      const int i = ilist_full[ii];
+
+      // loop over full neighbor list
+
+      const int * _noalias const jlist = firstneigh_full[i];
+      const int jnum = numneigh_full[ii];
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
+      for (int jj = 0; jj < jnum; jj++) {
+        const int joriginal = jlist[jj];
+        neighptr[n++] = joriginal;
+      }
+
+      ilist[ii] = i;
+      firstneigh[i] = neighptr;
+      numneigh[i] = n;
+
+      int pad_end = n;
+      IP_PRE_neighbor_pad(pad_end, 0);
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
+              avg=INTEL_COMPILE_WIDTH/2
+      #endif
+      for ( ; n < pad_end; n++)
+        neighptr[n] = e_nall;
+
+      ipage.vgot(n);
+      if (ipage.status())
+        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+    }
+  }
+  list->inum = inum_full;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void NPairHalffullNewtonIntel::build(NeighList *list)
+{
+  if (_fix->three_body_neighbor() == 0 || domain->triclinic) {
+    if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
+      build_t(list, _fix->get_mixed_buffers());
+    else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+      build_t(list, _fix->get_double_buffers());
+    else
+      build_t(list, _fix->get_single_buffers());
+  } else {
+    int *nhalf, *cnum;
+    if (_fix->precision() == FixIntel::PREC_MODE_MIXED) {
+      _fix->get_mixed_buffers()->get_list_data3(list->listfull, nhalf, cnum);
+      build_t3<float>(list, nhalf);
+    } else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+      _fix->get_double_buffers()->get_list_data3(list->listfull, nhalf, cnum);
+      build_t3<double>(list, nhalf);
+    } else {
+      _fix->get_single_buffers()->get_list_data3(list->listfull, nhalf, cnum);
+      build_t3<float>(list, nhalf);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
 NPairHalffullTrimNewtonIntel::NPairHalffullTrimNewtonIntel(LAMMPS *lmp) : NPair(lmp) {
  _fix = static_cast<FixIntel *>(modify->get_fix_by_id("package_intel"));
  if (!_fix) error->all(FLERR, "The 'package intel' command is required for /intel styles");
--- a/src/INTEL/npair_halffull_intel.h
+++ b/src/INTEL/npair_halffull_intel.h
@ -0,0 +1,128 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+// For Newton off, only used for hybrid to generate list for non-intel style.
+// Use standard routines.
+
+#ifdef NPAIR_CLASS
+// clang-format off
+NPairStyle(halffull/newton/intel,
+           NPairHalffullNewtonIntel,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI| NP_INTEL);
+
+NPairStyle(halffull/newton/skip/intel,
+           NPairHalffullNewtonIntel,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI | NP_SKIP | NP_INTEL);
+
+NPairStyle(halffull/newtoff/intel,
+           NPairHalffullNewtoff,
+           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
+           NP_ORTHO | NP_TRI | NP_INTEL);
+
+NPairStyle(halffull/newtoff/skip/intel,
+           NPairHalffullNewtoff,
+           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
+           NP_ORTHO | NP_TRI | NP_SKIP | NP_INTEL);
+
+NPairStyle(halffull/newtoff/ghost/intel,
+           NPairHalffullNewtoff,
+           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
+           NP_ORTHO | NP_TRI | NP_GHOST | NP_INTEL);
+
+NPairStyle(halffull/newtoff/skip/ghost/intel,
+           NPairHalffullNewtoff,
+           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
+           NP_ORTHO | NP_TRI | NP_SKIP | NP_GHOST | NP_INTEL);
+
+
+NPairStyle(halffull/trim/newton/intel,
+           NPairHalffullTrimNewtonIntel,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI| NP_TRIM | NP_INTEL);
+
+NPairStyle(halffull/trim/newton/skip/intel,
+           NPairHalffullTrimNewtonIntel,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI | NP_SKIP | NP_TRIM | NP_INTEL);
+
+NPairStyle(halffull/trim/newtoff/intel,
+           NPairHalffullTrimNewtoff,
+           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
+           NP_ORTHO | NP_TRI | NP_TRIM | NP_INTEL);
+
+NPairStyle(halffull/trim/newtoff/skip/intel,
+           NPairHalffullTrimNewtoff,
+           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
+           NP_ORTHO | NP_TRI | NP_SKIP | NP_TRIM | NP_INTEL);
+
+NPairStyle(halffull/trim/newtoff/ghost/intel,
+           NPairHalffullTrimNewtoff,
+           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
+           NP_ORTHO | NP_TRI | NP_GHOST | NP_TRIM | NP_INTEL);
+
+NPairStyle(halffull/trim/newtoff/skip/ghost/intel,
+           NPairHalffullTrimNewtoff,
+           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
+           NP_ORTHO | NP_TRI | NP_SKIP | NP_GHOST | NP_TRIM | NP_INTEL);
+// clang-format on
+#else
+
+#ifndef LMP_NPAIR_HALFFULL_INTEL_H
+#define LMP_NPAIR_HALFFULL_INTEL_H
+
+#include "fix_intel.h"
+#include "npair.h"
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+namespace LAMMPS_NS {
+
+class NPairHalffullNewtonIntel : public NPair {
+ public:
+  NPairHalffullNewtonIntel(class LAMMPS *);
+  void build(class NeighList *) override;
+
+ protected:
+  FixIntel *_fix;
+
+  template <class flt_t, class acc_t> void build_t(NeighList *, IntelBuffers<flt_t, acc_t> *);
+
+  template <class flt_t> void build_t3(NeighList *, int *);
+};
+
+class NPairHalffullTrimNewtonIntel : public NPair {
+ public:
+  NPairHalffullTrimNewtonIntel(class LAMMPS *);
+  void build(class NeighList *) override;
+
+ protected:
+  FixIntel *_fix;
+
+  template <class flt_t, class acc_t> void build_t(NeighList *, IntelBuffers<flt_t, acc_t> *);
+
+  template <class flt_t, class acc_t> void build_t3(NeighList *, int *, IntelBuffers<flt_t, acc_t> *);
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
--- a/src/INTEL/npair_halffull_newtoff_intel.h
+++ b/src/INTEL/npair_halffull_newtoff_intel.h
@ -1,44 +0,0 @@
-// clang-format off
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   LAMMPS development team: developers@lammps.org
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
-
-// Only used for hybrid to generate list for non-intel style. Use
-// standard routines.
-
-#ifdef NPAIR_CLASS
-// clang-format off
-NPairStyle(halffull/newtoff/intel,
-           NPairHalffullNewtoff,
-           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
-           NP_ORTHO | NP_TRI | NP_INTEL);
-
-NPairStyle(halffull/newtoff/skip/intel,
-           NPairHalffullNewtoff,
-           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
-           NP_ORTHO | NP_TRI | NP_SKIP | NP_INTEL);
-
-NPairStyle(halffull/newtoff/ghost/intel,
-           NPairHalffullNewtoff,
-           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
-           NP_ORTHO | NP_TRI | NP_GHOST | NP_INTEL);
-
-NPairStyle(halffull/newtoff/skip/ghost/intel,
-           NPairHalffullNewtoff,
-           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
-           NP_ORTHO | NP_TRI | NP_SKIP | NP_GHOST | NP_INTEL);
-// clang-format on
-#endif
--- a/src/INTEL/npair_halffull_newton_intel.cpp
+++ b/src/INTEL/npair_halffull_newton_intel.cpp
@ -1,256 +0,0 @@
-// clang-format off
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   LAMMPS development team: developers@lammps.org
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
-
-#include "npair_halffull_newton_intel.h"
-
-#include "atom.h"
-#include "comm.h"
-#include "domain.h"
-#include "error.h"
-#include "force.h"
-#include "modify.h"
-#include "my_page.h"
-#include "neigh_list.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-NPairHalffullNewtonIntel::NPairHalffullNewtonIntel(LAMMPS *lmp) : NPair(lmp) {
-  _fix = static_cast<FixIntel *>(modify->get_fix_by_id("package_intel"));
-  if (!_fix) error->all(FLERR, "The 'package intel' command is required for /intel styles");
-}
-
-/* ----------------------------------------------------------------------
-   build half list from full list
-   pair stored once if i,j are both owned and i < j
-   if j is ghost, only store if j coords are "above and to the right" of i
-   works if full list is a skip list
------------------------------------------------------------------------- */
-
-template <class flt_t, class acc_t>
-void NPairHalffullNewtonIntel::build_t(NeighList *list,
-                                       IntelBuffers<flt_t,acc_t> *buffers)
-{
-  const int inum_full = list->listfull->inum;
-  const int nlocal = atom->nlocal;
-  const int e_nall = nlocal + atom->nghost;
-  const ATOM_T * _noalias const x = buffers->get_x();
-  int * _noalias const ilist = list->ilist;
-  int * _noalias const numneigh = list->numneigh;
-  int ** _noalias const firstneigh = list->firstneigh;
-  const int * _noalias const ilist_full = list->listfull->ilist;
-  const int * _noalias const numneigh_full = list->listfull->numneigh;
-  const int ** _noalias const firstneigh_full = (const int ** const)list->listfull->firstneigh;  // NOLINT
-
-  const double delta = 0.01 * force->angstrom;
-  const int triclinic = domain->triclinic;
-
-  #if defined(_OPENMP)
-  #pragma omp parallel
-  #endif
-  {
-    int tid, ifrom, ito;
-    IP_PRE_omp_range_id(ifrom, ito, tid, inum_full, comm->nthreads);
-
-    // each thread has its own page allocator
-    MyPage<int> &ipage = list->ipage[tid];
-    ipage.reset();
-
-    // loop over parent full list
-    for (int ii = ifrom; ii < ito; ii++) {
-      int n = 0;
-      int *neighptr = ipage.vget();
-
-      const int i = ilist_full[ii];
-      const flt_t xtmp = x[i].x;
-      const flt_t ytmp = x[i].y;
-      const flt_t ztmp = x[i].z;
-
-      // loop over full neighbor list
-
-      const int * _noalias const jlist = firstneigh_full[i];
-      const int jnum = numneigh_full[i];
-
-      if (!triclinic) {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma vector aligned
-        #pragma ivdep
-        #endif
-        for (int jj = 0; jj < jnum; jj++) {
-          const int joriginal = jlist[jj];
-          const int j = joriginal & NEIGHMASK;
-          int addme = 1;
-          if (j < nlocal) {
-            if (i > j) addme = 0;
-          } else {
-            if (x[j].z < ztmp) addme = 0;
-            if (x[j].z == ztmp) {
-              if (x[j].y < ytmp) addme = 0;
-              if (x[j].y == ytmp && x[j].x < xtmp) addme = 0;
-            }
-          }
-          if (addme)
-            neighptr[n++] = joriginal;
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma vector aligned
-        #pragma ivdep
-        #endif
-        for (int jj = 0; jj < jnum; jj++) {
-          const int joriginal = jlist[jj];
-          const int j = joriginal & NEIGHMASK;
-          int addme = 1;
-          if (j < nlocal) {
-            if (i > j) addme = 0;
-          } else {
-            if (fabs(x[j].z-ztmp) > delta) {
-              if (x[j].z < ztmp) addme = 0;
-            } else if (fabs(x[j].y-ytmp) > delta) {
-              if (x[j].y < ytmp) addme = 0;
-            } else {
-              if (x[j].x < xtmp) addme = 0;
-            }
-          }
-          if (addme)
-            neighptr[n++] = joriginal;
-        }
-      }
-
-      ilist[ii] = i;
-      firstneigh[i] = neighptr;
-      numneigh[i] = n;
-
-      int pad_end = n;
-      IP_PRE_neighbor_pad(pad_end, 0);
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma vector aligned
-      #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
-              avg=INTEL_COMPILE_WIDTH/2
-      #endif
-      for ( ; n < pad_end; n++)
-        neighptr[n] = e_nall;
-
-      ipage.vgot(n);
-      if (ipage.status())
-        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
-    }
-  }
-  list->inum = inum_full;
-}
-
-/* ----------------------------------------------------------------------
-   build half list from full 3-body list
-   half list is already stored as first part of 3-body list
------------------------------------------------------------------------- */
-
-template <class flt_t>
-void NPairHalffullNewtonIntel::build_t3(NeighList *list, int *numhalf)
-{
-  const int inum_full = list->listfull->inum;
-  const int e_nall = atom->nlocal + atom->nghost;
-  int * _noalias const ilist = list->ilist;
-  int * _noalias const numneigh = list->numneigh;
-  int ** _noalias const firstneigh = list->firstneigh;
-  const int * _noalias const ilist_full = list->listfull->ilist;
-  const int * _noalias const numneigh_full = numhalf;
-  const int ** _noalias const firstneigh_full = (const int ** const)list->listfull->firstneigh;  // NOLINT
-
-  int packthreads = 1;
-  if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;
-
-  #if defined(_OPENMP)
-  #pragma omp parallel if (packthreads > 1)
-  #endif
-  {
-    int tid, ifrom, ito;
-    IP_PRE_omp_range_id(ifrom, ito, tid, inum_full, packthreads);
-
-    // each thread has its own page allocator
-    MyPage<int> &ipage = list->ipage[tid];
-    ipage.reset();
-
-    // loop over parent full list
-    for (int ii = ifrom; ii < ito; ii++) {
-      int n = 0;
-      int *neighptr = ipage.vget();
-
-      const int i = ilist_full[ii];
-
-      // loop over full neighbor list
-
-      const int * _noalias const jlist = firstneigh_full[i];
-      const int jnum = numneigh_full[ii];
-
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma vector aligned
-      #pragma ivdep
-      #endif
-      for (int jj = 0; jj < jnum; jj++) {
-        const int joriginal = jlist[jj];
-        neighptr[n++] = joriginal;
-      }
-
-      ilist[ii] = i;
-      firstneigh[i] = neighptr;
-      numneigh[i] = n;
-
-      int pad_end = n;
-      IP_PRE_neighbor_pad(pad_end, 0);
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma vector aligned
-      #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
-              avg=INTEL_COMPILE_WIDTH/2
-      #endif
-      for ( ; n < pad_end; n++)
-        neighptr[n] = e_nall;
-
-      ipage.vgot(n);
-      if (ipage.status())
-        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
-    }
-  }
-  list->inum = inum_full;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void NPairHalffullNewtonIntel::build(NeighList *list)
-{
-  if (_fix->three_body_neighbor() == 0 || domain->triclinic) {
-    if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
-      build_t(list, _fix->get_mixed_buffers());
-    else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
-      build_t(list, _fix->get_double_buffers());
-    else
-      build_t(list, _fix->get_single_buffers());
-  } else {
-    int *nhalf, *cnum;
-    if (_fix->precision() == FixIntel::PREC_MODE_MIXED) {
-      _fix->get_mixed_buffers()->get_list_data3(list->listfull, nhalf, cnum);
-      build_t3<float>(list, nhalf);
-    } else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-      _fix->get_double_buffers()->get_list_data3(list->listfull, nhalf, cnum);
-      build_t3<double>(list, nhalf);
-    } else {
-      _fix->get_single_buffers()->get_list_data3(list->listfull, nhalf, cnum);
-      build_t3<float>(list, nhalf);
-    }
-  }
-}
--- a/src/INTEL/npair_halffull_newton_intel.h
+++ b/src/INTEL/npair_halffull_newton_intel.h
@ -1,61 +0,0 @@
-// clang-format off
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   LAMMPS development team: developers@lammps.org
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
-
-#ifdef NPAIR_CLASS
-// clang-format off
-NPairStyle(halffull/newton/intel,
-           NPairHalffullNewtonIntel,
-           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
-           NP_ORTHO | NP_TRI| NP_INTEL);
-
-NPairStyle(halffull/newton/skip/intel,
-           NPairHalffullNewtonIntel,
-           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
-           NP_ORTHO | NP_TRI | NP_SKIP | NP_INTEL);
-// clang-format on
-#else
-
-#ifndef LMP_NPAIR_HALFFULL_NEWTON_INTEL_H
-#define LMP_NPAIR_HALFFULL_NEWTON_INTEL_H
-
-#include "fix_intel.h"
-#include "npair.h"
-
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-
-namespace LAMMPS_NS {
-
-class NPairHalffullNewtonIntel : public NPair {
- public:
-  NPairHalffullNewtonIntel(class LAMMPS *);
-  void build(class NeighList *) override;
-
- protected:
-  FixIntel *_fix;
-
-  template <class flt_t, class acc_t> void build_t(NeighList *, IntelBuffers<flt_t, acc_t> *);
-
-  template <class flt_t> void build_t3(NeighList *, int *);
-};
-
-}    // namespace LAMMPS_NS
-
-#endif
-#endif
--- a/src/INTEL/npair_halffull_trim_newtoff_intel.h
+++ b/src/INTEL/npair_halffull_trim_newtoff_intel.h
@ -1,44 +0,0 @@
-// clang-format off
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   LAMMPS development team: developers@lammps.org
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Stan Moore (SNL)
------------------------------------------------------------------------- */
-
-// Only used for hybrid to generate list for non-intel style. Use
-// standard routines.
-
-#ifdef NPAIR_CLASS
-// clang-format off
-NPairStyle(halffull/trim/newtoff/intel,
-           NPairHalffullTrimNewtoff,
-           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
-           NP_ORTHO | NP_TRI | NP_TRIM | NP_INTEL);
-
-NPairStyle(halffull/trim/newtoff/skip/intel,
-           NPairHalffullTrimNewtoff,
-           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
-           NP_ORTHO | NP_TRI | NP_TRIM | NP_SKIP | NP_INTEL);
-
-NPairStyle(halffull/trim/newtoff/ghost/intel,
-           NPairHalffullTrimNewtoff,
-           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
-           NP_ORTHO | NP_TRI | NP_TRIM | NP_GHOST | NP_INTEL);
-
-NPairStyle(halffull/trim/newtoff/skip/ghost/intel,
-           NPairHalffullTrimNewtoff,
-           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
-           NP_ORTHO | NP_TRI | NP_TRIM | NP_SKIP | NP_GHOST | NP_INTEL);
-// clang-format on
-#endif
--- a/src/INTEL/npair_halffull_trim_newton_intel.h
+++ b/src/INTEL/npair_halffull_trim_newton_intel.h
@ -1,61 +0,0 @@
-// clang-format off
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   LAMMPS development team: developers@lammps.org
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Stan Moore (SNL)
------------------------------------------------------------------------- */
-
-#ifdef NPAIR_CLASS
-// clang-format off
-NPairStyle(halffull/trim/newton/intel,
-           NPairHalffullTrimNewtonIntel,
-           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
-           NP_ORTHO | NP_TRI| NP_TRIM | NP_INTEL);
-
-NPairStyle(halffull/trim/newton/skip/intel,
-           NPairHalffullTrimNewtonIntel,
-           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
-           NP_ORTHO | NP_TRI | NP_SKIP | NP_TRIM | NP_INTEL);
-// clang-format on
-#else
-
-#ifndef LMP_NPAIR_HALFFULL_TRIM_NEWTON_INTEL_H
-#define LMP_NPAIR_HALFFULL_TRIM_NEWTON_INTEL_H
-
-#include "fix_intel.h"
-#include "npair.h"
-
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-
-namespace LAMMPS_NS {
-
-class NPairHalffullTrimNewtonIntel : public NPair {
- public:
-  NPairHalffullTrimNewtonIntel(class LAMMPS *);
-  void build(class NeighList *) override;
-
- protected:
-  FixIntel *_fix;
-
-  template <class flt_t, class acc_t> void build_t(NeighList *, IntelBuffers<flt_t, acc_t> *);
-
-  template <class flt_t, class acc_t> void build_t3(NeighList *, int *, IntelBuffers<flt_t, acc_t> *);
-};
-
-}    // namespace LAMMPS_NS
-
-#endif
-#endif
--- a/src/INTEL/npair_skip_intel.cpp
+++ b/src/INTEL/npair_skip_intel.cpp
@ -13,7 +13,7 @@
 ------------------------------------------------------------------------- */

 /* ----------------------------------------------------------------------
-   Contributing author: W. Michael Brown (Intel)
+   Contributing author: W. Michael Brown (Intel), Stan Moore (SNL)
 ------------------------------------------------------------------------- */

 #include "npair_skip_intel.h"
@ -224,3 +224,244 @@ void NPairSkipIntel::build(NeighList *list)
    }
  }
 }
+
+/* ---------------------------------------------------------------------- */
+
+NPairSkipTrimIntel::NPairSkipTrimIntel(LAMMPS *lmp) : NPair(lmp) {
+  _fix = static_cast<FixIntel *>(modify->get_fix_by_id("package_intel"));
+  if (!_fix) error->all(FLERR, "The 'package intel' command is required for /intel styles");
+  _inum_starts = new int[comm->nthreads];
+  _inum_counts = new int[comm->nthreads];
+  _full_props = nullptr;
+}
+
+/* ---------------------------------------------------------------------- */
+
+NPairSkipTrimIntel::~NPairSkipTrimIntel() {
+  delete []_inum_starts;
+  delete []_inum_counts;
+  delete[] _full_props;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void NPairSkipTrimIntel::copy_neighbor_info()
+{
+  NPair::copy_neighbor_info();
+  // Only need to set _full_props once; npair object deleted for changes
+  if (_full_props) return;
+  _full_props = new int[neighbor->nrequest];
+  for (int i = 0; i < neighbor->nrequest; i++)
+    _full_props[i] = neighbor->requests[i]->full;
+}
+
+/* ----------------------------------------------------------------------
+   build skip list for subset of types from parent list
+   works for half and full lists
+   works for owned (non-ghost) list, also for ghost list
+   iskip and ijskip flag which atom types and type pairs to skip
+   if ghost, also store neighbors of ghost atoms & set inum,gnum correctly
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int THREE>
+void NPairSkipTrimIntel::build_t(NeighList *list, int *numhalf, int *cnumneigh,
+                             int *numhalf_skip, IntelBuffers<flt_t,acc_t> *buffers)
+{
+  const int nlocal = atom->nlocal;
+  const int e_nall = nlocal + atom->nghost;
+  const ATOM_T * _noalias const x = buffers->get_x();
+  const int * _noalias const type = atom->type;
+  int * _noalias const ilist = list->ilist;
+  int * _noalias const numneigh = list->numneigh;
+  int ** _noalias const firstneigh = (int ** const)list->firstneigh;  // NOLINT
+  const int * _noalias const ilist_skip = list->listskip->ilist;
+  const int * _noalias const numneigh_skip = list->listskip->numneigh;
+  const int ** _noalias const firstneigh_skip = (const int ** const)list->listskip->firstneigh;  // NOLINT
+  const int * _noalias const iskip = list->iskip;
+  const int **  _noalias const ijskip = (const int ** const)list->ijskip;  // NOLINT
+
+  const flt_t cutsq_custom = cutoff_custom * cutoff_custom;
+  int num_skip = list->listskip->inum;
+  if (list->ghost) num_skip += list->listskip->gnum;
+
+  int packthreads;
+  if (comm->nthreads > INTEL_HTHREADS && THREE==0)
+    packthreads = comm->nthreads;
+  else
+    packthreads = 1;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel if (packthreads > 1)
+  #endif
+  {
+    int tid, ifrom, ito;
+    IP_PRE_omp_range_id(ifrom, ito, tid, num_skip, packthreads);
+
+    // each thread has its own page allocator
+    MyPage<int> &ipage = list->ipage[tid];
+    ipage.reset();
+
+    int my_inum = ifrom;
+    _inum_starts[tid] = ifrom;
+
+    // loop over parent full list
+    for (int ii = ifrom; ii < ito; ii++) {
+      const int i = ilist_skip[ii];
+      const int itype = type[i];
+      if (iskip[itype]) continue;
+
+      const flt_t xtmp = x[i].x;
+      const flt_t ytmp = x[i].y;
+      const flt_t ztmp = x[i].z;
+
+      int n = 0;
+      int *neighptr = ipage.vget();
+
+      // loop over parent non-skip list
+
+      const int * _noalias const jlist = firstneigh_skip[i];
+      const int jnum = numneigh_skip[i];
+
+      if (THREE) {
+        const int jnumhalf = numhalf_skip[ii];
+        for (int jj = 0; jj < jnumhalf; jj++) {
+          const int joriginal = jlist[jj];
+          const int j = joriginal & NEIGHMASK;
+
+          int addme = 1;
+          if (ijskip[itype][type[j]]) addme = 0;
+
+          // trim to shorter cutoff
+
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          if (rsq > cutsq_custom) addme = 0;
+
+          if (addme)
+            neighptr[n++] = joriginal;
+        }
+        numhalf[my_inum] = n;
+
+        for (int jj = jnumhalf; jj < jnum; jj++) {
+          const int joriginal = jlist[jj];
+          const int j = joriginal & NEIGHMASK;
+
+          int addme = 1;
+          if (ijskip[itype][type[j]]) addme = 0;
+
+          // trim to shorter cutoff
+
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          if (rsq > cutsq_custom) addme = 0;
+
+          if (addme)
+            neighptr[n++] = joriginal;
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          const int joriginal = jlist[jj];
+          const int j = joriginal & NEIGHMASK;
+
+          int addme = 1;
+          if (ijskip[itype][type[j]]) addme = 0;
+
+          // trim to shorter cutoff
+
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          if (rsq > cutsq_custom) addme = 0;
+
+          if (addme)
+            neighptr[n++] = joriginal;
+        }
+      }
+
+      ilist[my_inum++] = i;
+      firstneigh[i] = neighptr;
+      numneigh[i] = n;
+
+      int pad_end = n;
+      IP_PRE_neighbor_pad(pad_end, 0);
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
+              avg=INTEL_COMPILE_WIDTH/2
+      #endif
+      for ( ; n < pad_end; n++)
+        neighptr[n] = e_nall;
+
+      ipage.vgot(n);
+      if (ipage.status())
+        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+    }
+
+    int last_inum = 0, loop_end;
+    _inum_counts[tid] = my_inum;
+  }
+  int inum = _inum_counts[0];
+  for (int tid = 1; tid < packthreads; tid++) {
+    for (int i = _inum_starts[tid]; i < _inum_counts[tid]; i++) {
+      if (THREE) numhalf[inum] = numhalf[i];
+      ilist[inum++] = ilist[i];
+    }
+  }
+  list->inum = inum;
+
+  if (THREE && num_skip > 0) {
+    int * const list_start = firstneigh[ilist[0]];
+    for (int ii = 0; ii < inum; ii++) {
+      int i = ilist[ii];
+      cnumneigh[ii] = static_cast<int>(firstneigh[i] - list_start);
+    }
+  }
+  if (list->ghost) {
+    int num = 0;
+    int my_inum = list->inum;
+    for (int i = 0; i < my_inum; i++)
+      if (ilist[i] < nlocal) num++;
+      else break;
+    list->inum = num;
+    list->gnum = my_inum - num;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void NPairSkipTrimIntel::build(NeighList *list)
+{
+  if (_fix->three_body_neighbor()==0 ||
+      _full_props[list->listskip->index] == 0) {
+    if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
+      build_t<float,double,0>(list, nullptr, nullptr, nullptr, _fix->get_mixed_buffers());
+    else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+      build_t<double,double,0>(list, nullptr, nullptr, nullptr, _fix->get_double_buffers());
+    else
+      build_t<float,float,0>(list, nullptr, nullptr, nullptr, _fix->get_single_buffers());
+  } else {
+    int *nhalf, *cnumneigh, *nhalf_skip, *u;
+    if (_fix->precision() == FixIntel::PREC_MODE_MIXED) {
+      _fix->get_mixed_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
+      _fix->get_mixed_buffers()->grow_data3(list, nhalf, cnumneigh);
+      build_t<float,double,1>(list, nhalf, cnumneigh, nhalf_skip, _fix->get_mixed_buffers());
+    } else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+      _fix->get_double_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
+      _fix->get_double_buffers()->grow_data3(list, nhalf, cnumneigh);
+      build_t<double,double,1>(list, nhalf, cnumneigh, nhalf_skip, _fix->get_double_buffers());
+    } else {
+      _fix->get_single_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
+      _fix->get_single_buffers()->grow_data3(list,nhalf,cnumneigh);
+      build_t<float,float,1>(list, nhalf, cnumneigh, nhalf_skip, _fix->get_single_buffers());
+    }
+  }
+}
--- a/src/INTEL/npair_skip_intel.h
+++ b/src/INTEL/npair_skip_intel.h
@ -25,6 +25,18 @@ NPairStyle(skip/ghost/intel,
           NP_SKIP | NP_HALF | NP_FULL |
           NP_NSQ | NP_BIN | NP_MULTI |
           NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_GHOST | NP_INTEL);
+
+NPairStyle(skip/trim/intel,
+           NPairSkipTrimIntel,
+           NP_SKIP | NP_HALF | NP_FULL |
+           NP_NSQ | NP_BIN | NP_MULTI |
+           NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_TRIM | NP_INTEL);
+
+NPairStyle(skip/trim/ghost/intel,
+           NPairSkipTrimIntel,
+           NP_SKIP | NP_HALF | NP_FULL |
+           NP_NSQ | NP_BIN | NP_MULTI |
+           NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_TRIM | NP_GHOST | NP_INTEL);
 // clang-format on
 #else

@ -55,6 +67,22 @@ class NPairSkipIntel : public NPair {
  void build_t(NeighList *, int *numhalf, int *cnumneigh, int *numhalf_skip);
 };

+class NPairSkipTrimIntel : public NPair {
+ public:
+  NPairSkipTrimIntel(class LAMMPS *);
+  ~NPairSkipTrimIntel() override;
+  void copy_neighbor_info() override;
+  void build(class NeighList *) override;
+
+ protected:
+  FixIntel *_fix;
+  int *_inum_starts, *_inum_counts, *_full_props;
+
+  template <class flt_t, class acc_t, int THREE>
+  void build_t(NeighList *, int *numhalf, int *cnumneigh, int *numhalf_skip,
+               IntelBuffers<flt_t, acc_t> *);
+};
+
 }    // namespace LAMMPS_NS

 #endif
--- a/src/INTEL/npair_skip_trim_intel.cpp
+++ b/src/INTEL/npair_skip_trim_intel.cpp
@ -1,271 +0,0 @@
-// clang-format off
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   LAMMPS development team: developers@lammps.org
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Stan Moore (SNL)
------------------------------------------------------------------------- */
-
-#include "npair_skip_trim_intel.h"
-
-#include "atom.h"
-#include "comm.h"
-#include "error.h"
-#include "modify.h"
-#include "my_page.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "neighbor.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-NPairSkipTrimIntel::NPairSkipTrimIntel(LAMMPS *lmp) : NPair(lmp) {
-  _fix = static_cast<FixIntel *>(modify->get_fix_by_id("package_intel"));
-  if (!_fix) error->all(FLERR, "The 'package intel' command is required for /intel styles");
-  _inum_starts = new int[comm->nthreads];
-  _inum_counts = new int[comm->nthreads];
-  _full_props = nullptr;
-}
-
-/* ---------------------------------------------------------------------- */
-
-NPairSkipTrimIntel::~NPairSkipTrimIntel() {
-  delete []_inum_starts;
-  delete []_inum_counts;
-  delete[] _full_props;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void NPairSkipTrimIntel::copy_neighbor_info()
-{
-  NPair::copy_neighbor_info();
-  // Only need to set _full_props once; npair object deleted for changes
-  if (_full_props) return;
-  _full_props = new int[neighbor->nrequest];
-  for (int i = 0; i < neighbor->nrequest; i++)
-    _full_props[i] = neighbor->requests[i]->full;
-}
-
-/* ----------------------------------------------------------------------
-   build skip list for subset of types from parent list
-   works for half and full lists
-   works for owned (non-ghost) list, also for ghost list
-   iskip and ijskip flag which atom types and type pairs to skip
-   if ghost, also store neighbors of ghost atoms & set inum,gnum correctly
------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int THREE>
-void NPairSkipTrimIntel::build_t(NeighList *list, int *numhalf, int *cnumneigh,
-                             int *numhalf_skip, IntelBuffers<flt_t,acc_t> *buffers)
-{
-  const int nlocal = atom->nlocal;
-  const int e_nall = nlocal + atom->nghost;
-  const ATOM_T * _noalias const x = buffers->get_x();
-  const int * _noalias const type = atom->type;
-  int * _noalias const ilist = list->ilist;
-  int * _noalias const numneigh = list->numneigh;
-  int ** _noalias const firstneigh = (int ** const)list->firstneigh;  // NOLINT
-  const int * _noalias const ilist_skip = list->listskip->ilist;
-  const int * _noalias const numneigh_skip = list->listskip->numneigh;
-  const int ** _noalias const firstneigh_skip = (const int ** const)list->listskip->firstneigh;  // NOLINT
-  const int * _noalias const iskip = list->iskip;
-  const int **  _noalias const ijskip = (const int ** const)list->ijskip;  // NOLINT
-
-  const flt_t cutsq_custom = cutoff_custom * cutoff_custom;
-  int num_skip = list->listskip->inum;
-  if (list->ghost) num_skip += list->listskip->gnum;
-
-  int packthreads;
-  if (comm->nthreads > INTEL_HTHREADS && THREE==0)
-    packthreads = comm->nthreads;
-  else
-    packthreads = 1;
-
-  #if defined(_OPENMP)
-  #pragma omp parallel if (packthreads > 1)
-  #endif
-  {
-    int tid, ifrom, ito;
-    IP_PRE_omp_range_id(ifrom, ito, tid, num_skip, packthreads);
-
-    // each thread has its own page allocator
-    MyPage<int> &ipage = list->ipage[tid];
-    ipage.reset();
-
-    int my_inum = ifrom;
-    _inum_starts[tid] = ifrom;
-
-    // loop over parent full list
-    for (int ii = ifrom; ii < ito; ii++) {
-      const int i = ilist_skip[ii];
-      const int itype = type[i];
-      if (iskip[itype]) continue;
-
-      const flt_t xtmp = x[i].x;
-      const flt_t ytmp = x[i].y;
-      const flt_t ztmp = x[i].z;
-
-      int n = 0;
-      int *neighptr = ipage.vget();
-
-      // loop over parent non-skip list
-
-      const int * _noalias const jlist = firstneigh_skip[i];
-      const int jnum = numneigh_skip[i];
-
-      if (THREE) {
-        const int jnumhalf = numhalf_skip[ii];
-        for (int jj = 0; jj < jnumhalf; jj++) {
-          const int joriginal = jlist[jj];
-          const int j = joriginal & NEIGHMASK;
-
-          int addme = 1;
-          if (ijskip[itype][type[j]]) addme = 0;
-
-          // trim to shorter cutoff
-
-          const flt_t delx = xtmp - x[j].x;
-          const flt_t dely = ytmp - x[j].y;
-          const flt_t delz = ztmp - x[j].z;
-          const flt_t rsq = delx * delx + dely * dely + delz * delz;
-          if (rsq > cutsq_custom) addme = 0;
-
-          if (addme)
-            neighptr[n++] = joriginal;
-        }
-        numhalf[my_inum] = n;
-
-        for (int jj = jnumhalf; jj < jnum; jj++) {
-          const int joriginal = jlist[jj];
-          const int j = joriginal & NEIGHMASK;
-
-          int addme = 1;
-          if (ijskip[itype][type[j]]) addme = 0;
-
-          // trim to shorter cutoff
-
-          const flt_t delx = xtmp - x[j].x;
-          const flt_t dely = ytmp - x[j].y;
-          const flt_t delz = ztmp - x[j].z;
-          const flt_t rsq = delx * delx + dely * dely + delz * delz;
-          if (rsq > cutsq_custom) addme = 0;
-
-          if (addme)
-            neighptr[n++] = joriginal;
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma vector aligned
-        #pragma ivdep
-        #endif
-        for (int jj = 0; jj < jnum; jj++) {
-          const int joriginal = jlist[jj];
-          const int j = joriginal & NEIGHMASK;
-
-          int addme = 1;
-          if (ijskip[itype][type[j]]) addme = 0;
-
-          // trim to shorter cutoff
-
-          const flt_t delx = xtmp - x[j].x;
-          const flt_t dely = ytmp - x[j].y;
-          const flt_t delz = ztmp - x[j].z;
-          const flt_t rsq = delx * delx + dely * dely + delz * delz;
-          if (rsq > cutsq_custom) addme = 0;
-
-          if (addme)
-            neighptr[n++] = joriginal;
-        }
-      }
-
-      ilist[my_inum++] = i;
-      firstneigh[i] = neighptr;
-      numneigh[i] = n;
-
-      int pad_end = n;
-      IP_PRE_neighbor_pad(pad_end, 0);
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma vector aligned
-      #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
-              avg=INTEL_COMPILE_WIDTH/2
-      #endif
-      for ( ; n < pad_end; n++)
-        neighptr[n] = e_nall;
-
-      ipage.vgot(n);
-      if (ipage.status())
-        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
-    }
-
-    int last_inum = 0, loop_end;
-    _inum_counts[tid] = my_inum;
-  }
-  int inum = _inum_counts[0];
-  for (int tid = 1; tid < packthreads; tid++) {
-    for (int i = _inum_starts[tid]; i < _inum_counts[tid]; i++) {
-      if (THREE) numhalf[inum] = numhalf[i];
-      ilist[inum++] = ilist[i];
-    }
-  }
-  list->inum = inum;
-
-  if (THREE && num_skip > 0) {
-    int * const list_start = firstneigh[ilist[0]];
-    for (int ii = 0; ii < inum; ii++) {
-      int i = ilist[ii];
-      cnumneigh[ii] = static_cast<int>(firstneigh[i] - list_start);
-    }
-  }
-  if (list->ghost) {
-    int num = 0;
-    int my_inum = list->inum;
-    for (int i = 0; i < my_inum; i++)
-      if (ilist[i] < nlocal) num++;
-      else break;
-    list->inum = num;
-    list->gnum = my_inum - num;
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void NPairSkipTrimIntel::build(NeighList *list)
-{
-  if (_fix->three_body_neighbor()==0 ||
-      _full_props[list->listskip->index] == 0) {
-    if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
-      build_t<float,double,0>(list, nullptr, nullptr, nullptr, _fix->get_mixed_buffers());
-    else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
-      build_t<double,double,0>(list, nullptr, nullptr, nullptr, _fix->get_double_buffers());
-    else
-      build_t<float,float,0>(list, nullptr, nullptr, nullptr, _fix->get_single_buffers());
-  } else {
-    int *nhalf, *cnumneigh, *nhalf_skip, *u;
-    if (_fix->precision() == FixIntel::PREC_MODE_MIXED) {
-      _fix->get_mixed_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
-      _fix->get_mixed_buffers()->grow_data3(list, nhalf, cnumneigh);
-      build_t<float,double,1>(list, nhalf, cnumneigh, nhalf_skip, _fix->get_mixed_buffers());
-    } else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-      _fix->get_double_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
-      _fix->get_double_buffers()->grow_data3(list, nhalf, cnumneigh);
-      build_t<double,double,1>(list, nhalf, cnumneigh, nhalf_skip, _fix->get_double_buffers());
-    } else {
-      _fix->get_single_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
-      _fix->get_single_buffers()->grow_data3(list,nhalf,cnumneigh);
-      build_t<float,float,1>(list, nhalf, cnumneigh, nhalf_skip, _fix->get_single_buffers());
-    }
-  }
-}
--- a/src/INTEL/npair_skip_trim_intel.h
+++ b/src/INTEL/npair_skip_trim_intel.h
@ -1,62 +0,0 @@
-// clang-format off
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   LAMMPS development team: developers@lammps.org
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-#ifdef NPAIR_CLASS
-// clang-format off
-NPairStyle(skip/trim/intel,
-           NPairSkipTrimIntel,
-           NP_SKIP | NP_HALF | NP_FULL |
-           NP_NSQ | NP_BIN | NP_MULTI |
-           NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_TRIM | NP_INTEL);
-
-NPairStyle(skip/trim/ghost/intel,
-           NPairSkipTrimIntel,
-           NP_SKIP | NP_HALF | NP_FULL |
-           NP_NSQ | NP_BIN | NP_MULTI |
-           NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_TRIM | NP_GHOST | NP_INTEL);
-// clang-format on
-#else
-
-#ifndef LMP_NPAIR_SKIP_TRIM_INTEL_H
-#define LMP_NPAIR_SKIP_TRIM_INTEL_H
-
-#include "fix_intel.h"
-#include "npair.h"
-
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-
-namespace LAMMPS_NS {
-
-class NPairSkipTrimIntel : public NPair {
- public:
-  NPairSkipTrimIntel(class LAMMPS *);
-  ~NPairSkipTrimIntel() override;
-  void copy_neighbor_info() override;
-  void build(class NeighList *) override;
-
- protected:
-  FixIntel *_fix;
-  int *_inum_starts, *_inum_counts, *_full_props;
-
-  template <class flt_t, class acc_t, int THREE>
-  void build_t(NeighList *, int *numhalf, int *cnumneigh, int *numhalf_skip,
-               IntelBuffers<flt_t, acc_t> *);
-};
-
-}    // namespace LAMMPS_NS
-
-#endif
-#endif
--- a/src/INTEL/nstencil_bin_intel.cpp
+++ b/src/INTEL/nstencil_bin_intel.cpp
@ -0,0 +1,70 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "nstencil_bin_intel.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<int HALF, int DIM_3D, int TRI>
+NStencilBinIntel<HALF, DIM_3D, TRI>::NStencilBinIntel(LAMMPS *lmp) : NStencil(lmp) {}
+
+/* ----------------------------------------------------------------------
+   create stencil based on bin geometry and cutoff
+------------------------------------------------------------------------- */
+
+template<int HALF, int DIM_3D, int TRI>
+void NStencilBinIntel<HALF, DIM_3D, TRI>::create()
+{
+  int i, j, k;
+
+  // For half stencils, only the upper plane is needed
+  int sy_min = sy;
+  int sz_min = sz;
+  if ((!TRI) && HALF && (!DIM_3D)) sy_min = 0;
+  if ((!TRI) && HALF && DIM_3D) sz_min = 0;
+
+  nstencil = 0;
+
+  // For Intel, half and ortho stencils do not include central bin
+  // as, historically, this was never included in a stencil.
+  // Non-Intel npair classes were updated to account for this change,
+  // but the Intel npair classes have not yet been updated
+  // if (HALF && (!TRI)) stencil[nstencil++] = 0;
+
+  for (k = -sz_min; k <= sz; k++) {
+    for (j = -sy_min; j <= sy; j++) {
+      for (i = -sx; i <= sx; i++) {
+
+        // Now only include "upper right" bins for half and ortho stencils
+        if (HALF && (!DIM_3D) && (!TRI))
+          if (! (j > 0 || (j == 0 && i > 0))) continue;
+        if (HALF && DIM_3D && (!TRI))
+          if (! (k > 0 || j > 0 || (j == 0 && i > 0))) continue;
+
+        if (bin_distance(i, j, k) < cutneighmaxsq)
+          stencil[nstencil++] = k * mbiny * mbinx + j * mbinx + i;
+      }
+    }
+  }
+}
+
+namespace LAMMPS_NS {
+template class NStencilBinIntel<0,0,0>;
+template class NStencilBinIntel<0,1,0>;
+template class NStencilBinIntel<1,0,0>;
+template class NStencilBinIntel<1,0,1>;
+template class NStencilBinIntel<1,1,0>;
+template class NStencilBinIntel<1,1,1>;
+}
--- a/Show More
+++ b/Show More