From fd3be6176d7c40317c23cb181aba00ae87064b54 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 23 Jul 2021 08:36:23 -0400 Subject: [PATCH 01/17] remove dead code --- python/lammps/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/lammps/core.py b/python/lammps/core.py index d981243503..c4639eee4f 100644 --- a/python/lammps/core.py +++ b/python/lammps/core.py @@ -1790,7 +1790,6 @@ class lammps(object): with ExceptionCheck(self): return self.lib.lammps_fix_external_get_force(self.lmp, fix_id.encode()) - return None # ------------------------------------------------------------------------- From 594bf56ee34b069341eee38fef0b2865245cf00c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 23 Jul 2021 12:12:40 -0400 Subject: [PATCH 02/17] silence compiler warnings on macOS --- unittest/formats/test_file_operations.cpp | 1 - unittest/utils/test_tokenizer.cpp | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/unittest/formats/test_file_operations.cpp b/unittest/formats/test_file_operations.cpp index 794222fe5a..af2023cebb 100644 --- a/unittest/formats/test_file_operations.cpp +++ b/unittest/formats/test_file_operations.cpp @@ -281,7 +281,6 @@ TEST_F(FileOperationsTest, error_message_warn) TEST_F(FileOperationsTest, error_all_one) { - char buf[64]; BEGIN_HIDE_OUTPUT(); command("echo none"); command("log none"); diff --git a/unittest/utils/test_tokenizer.cpp b/unittest/utils/test_tokenizer.cpp index c62a947d5c..698283ea53 100644 --- a/unittest/utils/test_tokenizer.cpp +++ b/unittest/utils/test_tokenizer.cpp @@ -94,7 +94,8 @@ TEST(Tokenizer, copy_constructor) TEST(Tokenizer, move_constructor) { - Tokenizer u = std::move(Tokenizer("test new word ", " ")); + Tokenizer t("test new word ", " "); + Tokenizer u = std::move(t); ASSERT_THAT(u.next(), Eq("test")); ASSERT_THAT(u.next(), Eq("new")); ASSERT_THAT(u.next(), Eq("word")); @@ -248,7 +249,8 @@ TEST(ValueTokenizer, copy_constructor) TEST(ValueTokenizer, move_constructor) { - ValueTokenizer u = std::move(ValueTokenizer(" test new word ", " ")); + ValueTokenizer t(" test new word ", " "); + ValueTokenizer u = std::move(t); ASSERT_THAT(u.next_string(), Eq("test")); ASSERT_THAT(u.next_string(), Eq("new")); ASSERT_THAT(u.next_string(), Eq("word")); From 25aa2029769f43801d28a7bce9cdec9e8cf5a35d Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 23 Jul 2021 12:12:57 -0400 Subject: [PATCH 03/17] tweak epsilon to pass reaxff unittests on macOS --- unittest/force-styles/tests/atomic-pair-reax_c.yaml | 2 +- unittest/force-styles/tests/atomic-pair-reax_c_lgvdw.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unittest/force-styles/tests/atomic-pair-reax_c.yaml b/unittest/force-styles/tests/atomic-pair-reax_c.yaml index c4d41a2f2a..d5bed64ae6 100644 --- a/unittest/force-styles/tests/atomic-pair-reax_c.yaml +++ b/unittest/force-styles/tests/atomic-pair-reax_c.yaml @@ -1,7 +1,7 @@ --- lammps_version: 2 Jul 2021 date_generated: Wed Jul 21 15:49:45 2021 -epsilon: 1e-11 +epsilon: 2e-11 prerequisites: ! | pair reaxff fix qeq/reaxff diff --git a/unittest/force-styles/tests/atomic-pair-reax_c_lgvdw.yaml b/unittest/force-styles/tests/atomic-pair-reax_c_lgvdw.yaml index c766c242f9..b124a6b00b 100644 --- a/unittest/force-styles/tests/atomic-pair-reax_c_lgvdw.yaml +++ b/unittest/force-styles/tests/atomic-pair-reax_c_lgvdw.yaml @@ -1,7 +1,7 @@ --- lammps_version: 2 Jul 2021 date_generated: Wed Jul 21 15:49:47 2021 -epsilon: 1e-12 +epsilon: 3e-12 prerequisites: ! 
| pair reaxff fix qeq/reaxff From c5872528948884d7f1fa6eece6dc6dd9b198bcf7 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 23 Jul 2021 12:13:50 -0400 Subject: [PATCH 04/17] rename unittest YAML files --- .../tests/{atomic-pair-reax_c.yaml => atomic-pair-reaxff.yaml} | 0 ...tomic-pair-reax_c_lgvdw.yaml => atomic-pair-reaxff_lgvdw.yaml} | 0 ...tomic-pair-reax_c_noqeq.yaml => atomic-pair-reaxff_noqeq.yaml} | 0 ...pair-reax_c_tabulate.yaml => atomic-pair-reaxff_tabulate.yaml} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename unittest/force-styles/tests/{atomic-pair-reax_c.yaml => atomic-pair-reaxff.yaml} (100%) rename unittest/force-styles/tests/{atomic-pair-reax_c_lgvdw.yaml => atomic-pair-reaxff_lgvdw.yaml} (100%) rename unittest/force-styles/tests/{atomic-pair-reax_c_noqeq.yaml => atomic-pair-reaxff_noqeq.yaml} (100%) rename unittest/force-styles/tests/{atomic-pair-reax_c_tabulate.yaml => atomic-pair-reaxff_tabulate.yaml} (100%) diff --git a/unittest/force-styles/tests/atomic-pair-reax_c.yaml b/unittest/force-styles/tests/atomic-pair-reaxff.yaml similarity index 100% rename from unittest/force-styles/tests/atomic-pair-reax_c.yaml rename to unittest/force-styles/tests/atomic-pair-reaxff.yaml diff --git a/unittest/force-styles/tests/atomic-pair-reax_c_lgvdw.yaml b/unittest/force-styles/tests/atomic-pair-reaxff_lgvdw.yaml similarity index 100% rename from unittest/force-styles/tests/atomic-pair-reax_c_lgvdw.yaml rename to unittest/force-styles/tests/atomic-pair-reaxff_lgvdw.yaml diff --git a/unittest/force-styles/tests/atomic-pair-reax_c_noqeq.yaml b/unittest/force-styles/tests/atomic-pair-reaxff_noqeq.yaml similarity index 100% rename from unittest/force-styles/tests/atomic-pair-reax_c_noqeq.yaml rename to unittest/force-styles/tests/atomic-pair-reaxff_noqeq.yaml diff --git a/unittest/force-styles/tests/atomic-pair-reax_c_tabulate.yaml b/unittest/force-styles/tests/atomic-pair-reaxff_tabulate.yaml similarity index 100% rename from unittest/force-styles/tests/atomic-pair-reax_c_tabulate.yaml rename to unittest/force-styles/tests/atomic-pair-reaxff_tabulate.yaml From 57270c5339609748f26a18ad05250a5a45aba981 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sat, 24 Jul 2021 13:46:13 -0400 Subject: [PATCH 05/17] set flag to tell CMake to not use/link the MPI C++ interface in the correct place --- cmake/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index a5861ab885..6c446bdb1b 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -238,20 +238,20 @@ if(PKG_ADIOS) endif() if(NOT CMAKE_CROSSCOMPILING) - set(MPI_CXX_SKIP_MPICXX TRUE) find_package(MPI QUIET) option(BUILD_MPI "Build MPI version" ${MPI_FOUND}) else() - set(MPI_CXX_SKIP_MPICXX TRUE) option(BUILD_MPI "Build MPI version" OFF) endif() if(BUILD_MPI) # We use a non-standard procedure to cross-compile with MPI on Windows if((CMAKE_SYSTEM_NAME STREQUAL "Windows") AND CMAKE_CROSSCOMPILING) + set(MPI_CXX_SKIP_MPICXX TRUE) include(MPI4WIN) target_link_libraries(lammps PUBLIC MPI::MPI_CXX) else() + set(MPI_CXX_SKIP_MPICXX TRUE) find_package(MPI REQUIRED) target_link_libraries(lammps PUBLIC MPI::MPI_CXX) option(LAMMPS_LONGLONG_TO_LONG "Workaround if your system or MPI version does not recognize 'long long' data types" OFF) From 1ebd60e35e5956c6d0691bbe4098da393e9f80f6 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 25 Jul 2021 10:56:38 -0400 Subject: [PATCH 06/17] update and add explanation --- cmake/CMakeLists.txt | 5 +++-- 1 
file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 6c446bdb1b..7269fc5c9b 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -245,13 +245,14 @@ else() endif() if(BUILD_MPI) + # do not include the (obsolete) MPI C++ bindings which makes + # for leaner object files and avoids namespace conflicts + set(MPI_CXX_SKIP_MPICXX TRUE) # We use a non-standard procedure to cross-compile with MPI on Windows if((CMAKE_SYSTEM_NAME STREQUAL "Windows") AND CMAKE_CROSSCOMPILING) - set(MPI_CXX_SKIP_MPICXX TRUE) include(MPI4WIN) target_link_libraries(lammps PUBLIC MPI::MPI_CXX) else() - set(MPI_CXX_SKIP_MPICXX TRUE) find_package(MPI REQUIRED) target_link_libraries(lammps PUBLIC MPI::MPI_CXX) option(LAMMPS_LONGLONG_TO_LONG "Workaround if your system or MPI version does not recognize 'long long' data types" OFF) From 7e6a06b1cc76ed475419baf4073427bff5397185 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 25 Jul 2021 21:04:26 -0400 Subject: [PATCH 07/17] update Pizza.py URLs --- doc/src/Howto_viz.rst | 2 +- doc/src/Python_examples.rst | 4 ++-- doc/src/Tools.rst | 2 +- doc/src/balance.rst | 2 +- doc/src/dump.rst | 2 +- doc/src/dump_image.rst | 4 ++-- doc/src/fix_balance.rst | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/src/Howto_viz.rst b/doc/src/Howto_viz.rst index 2eb009c185..bcad6dddec 100644 --- a/doc/src/Howto_viz.rst +++ b/doc/src/Howto_viz.rst @@ -25,7 +25,7 @@ RasMol visualization programs. Pizza.py has tools that do interactive 3d OpenGL visualization and one that creates SVG images of dump file snapshots. -.. _pizza: https://pizza.sandia.gov +.. _pizza: https://lammps.github.io/pizza .. _ensight: https://www.ansys.com/products/fluids/ansys-ensight diff --git a/doc/src/Python_examples.rst b/doc/src/Python_examples.rst index 7466b52ba9..fe5fe812c5 100644 --- a/doc/src/Python_examples.rst +++ b/doc/src/Python_examples.rst @@ -35,9 +35,9 @@ visualization package you have installed. Note that for GL, you need to be able to run the Pizza.py GL tool, which is included in the pizza sub-directory. See the Pizza.py doc pages for more info: -* `https://pizza.sandia.gov `_ +* `https://lammps.github.io/pizza `_ -.. _pizza: https://pizza.sandia.gov +.. _pizza: https://lammps.github.io/pizza Note that for AtomEye, you need version 3, and there is a line in the scripts that specifies the path and name of the executable. See the diff --git a/doc/src/Tools.rst b/doc/src/Tools.rst index e7cd87aa78..7eb6515fad 100644 --- a/doc/src/Tools.rst +++ b/doc/src/Tools.rst @@ -15,7 +15,7 @@ Sandia which provides tools for doing setup, analysis, plotting, and visualization for LAMMPS simulations. .. _lws: https://www.lammps.org -.. _pizza: https://pizza.sandia.gov +.. _pizza: https://lammps.github.io/pizza .. _python: https://www.python.org Additional tools included in the LAMMPS distribution are described on diff --git a/doc/src/balance.rst b/doc/src/balance.rst index 5d42fabb21..f7ae4c6bff 100644 --- a/doc/src/balance.rst +++ b/doc/src/balance.rst @@ -558,7 +558,7 @@ Related commands :doc:`group `, :doc:`processors `, :doc:`fix balance `, :doc:`comm_style ` -.. _pizza: https://pizza.sandia.gov +.. _pizza: https://lammps.github.io/pizza Default """"""" diff --git a/doc/src/dump.rst b/doc/src/dump.rst index a5f11a792a..b55cde650e 100644 --- a/doc/src/dump.rst +++ b/doc/src/dump.rst @@ -230,7 +230,7 @@ individual values and the file itself. 
The *atom*\ , *local*\ , and *custom* styles create files in a simple text format that is self-explanatory when viewing a dump file. Some of the LAMMPS post-processing tools described on the :doc:`Tools ` doc -page, including `Pizza.py `_, +page, including `Pizza.py `_, work with this format, as does the :doc:`rerun ` command. For post-processing purposes the *atom*\ , *local*\ , and *custom* text diff --git a/doc/src/dump_image.rst b/doc/src/dump_image.rst index 3923d5c2dc..291aeae9c3 100644 --- a/doc/src/dump_image.rst +++ b/doc/src/dump_image.rst @@ -590,8 +590,8 @@ Play the movie: % mplayer foo.mpg % ffplay bar.avi -* c) Use the `Pizza.py `_ - `animate tool `_, +* c) Use the `Pizza.py `_ + `animate tool `_, which works directly on a series of image files. .. code-block:: python diff --git a/doc/src/fix_balance.rst b/doc/src/fix_balance.rst index 8bab8ebefc..772f049461 100644 --- a/doc/src/fix_balance.rst +++ b/doc/src/fix_balance.rst @@ -403,7 +403,7 @@ Related commands :doc:`group `, :doc:`processors `, :doc:`balance `, :doc:`comm_style ` -.. _pizza: https://pizza.sandia.gov +.. _pizza: https://lammps.github.io/pizza Default """"""" From ef244c3061cd37614b0d6fc6693fddeffb22dfe8 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 25 Jul 2021 21:05:33 -0400 Subject: [PATCH 08/17] small updates to the introduction --- doc/src/Intro_features.rst | 23 +++++++++++++++-------- doc/src/Intro_nonfeatures.rst | 2 +- doc/src/Intro_overview.rst | 20 +++++++++++--------- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/doc/src/Intro_features.rst b/doc/src/Intro_features.rst index 648c427b11..f1c04bdb23 100644 --- a/doc/src/Intro_features.rst +++ b/doc/src/Intro_features.rst @@ -24,11 +24,15 @@ General features ^^^^^^^^^^^^^^^^ * runs on a single processor or in parallel -* distributed-memory message-passing parallelism (MPI) -* spatial-decomposition of simulation domain for parallelism -* open-source distribution -* highly portable C++ -* optional libraries used: MPI and single-processor FFT +* distributed memory message-passing parallelism (MPI) +* shared memory multi-threading parallelism (OpenMP) +* spatial decomposition of simulation domain for MPI parallelism +* particle decomposition inside of spatial decomposition for OpenMP parallelism +* GPLv2 licensed open-source distribution +* highly portable C++-11 +* modular code with most functionality in optional packages +* only depends on MPI library for basic parallel functionality +* other libraries are optional and only required for specific packages * GPU (CUDA and OpenCL), Intel Xeon Phi, and OpenMP support for many code features * easy to extend with new features and functionality * runs from an input script @@ -68,9 +72,9 @@ Interatomic potentials (force fields) :doc:`improper style `, :doc:`kspace style ` commands) -* pairwise potentials: Lennard-Jones, Buckingham, Morse, Born-Mayer-Huggins, Yukawa, soft, class 2 (COMPASS), hydrogen bond, tabulated +* pairwise potentials: Lennard-Jones, Buckingham, Morse, Born-Mayer-Huggins, Yukawa, soft, class 2 (COMPASS), hydrogen bond, tabulated * charged pairwise potentials: Coulombic, point-dipole -* many-body potentials: EAM, Finnis/Sinclair EAM, modified EAM (MEAM), embedded ion method (EIM), EDIP, ADP, Stillinger-Weber, Tersoff, REBO, AIREBO, ReaxFF, COMB, SNAP, Streitz-Mintmire, 3-body polymorphic +* many-body potentials: EAM, Finnis/Sinclair EAM, modified EAM (MEAM), embedded ion method (EIM), EDIP, ADP, Stillinger-Weber, Tersoff, REBO, AIREBO, ReaxFF, COMB, SNAP, 
Streitz-Mintmire, 3-body polymorphic * long-range interactions for charge, point-dipoles, and LJ dispersion: Ewald, Wolf, PPPM (similar to particle-mesh Ewald) * polarization models: :doc:`QEq `, :doc:`core/shell model `, :doc:`Drude dipole model ` * charge equilibration (QEq via dynamic, point, shielded, Slater methods) @@ -170,9 +174,12 @@ Multi-replica models ^^^^^^^^^^^^^^^^^^^^ * :doc:`nudged elastic band ` +* :doc:`hyperdynamics ` * :doc:`parallel replica dynamics ` * :doc:`temperature accelerated dynamics ` * :doc:`parallel tempering ` +* :doc:`path-integral MD ` +* multi-walker collective variables with :doc:`Colvars ` and :doc:`Plumed ` .. _prepost: @@ -187,7 +194,7 @@ Pre- and post-processing plotting, and visualization for LAMMPS simulations. Pizza.py is written in `Python `_ and is available for download from `the Pizza.py WWW site `_. -.. _pizza: https://pizza.sandia.gov +.. _pizza: https://lammps.github.io/pizza .. _python: http://www.python.org diff --git a/doc/src/Intro_nonfeatures.rst b/doc/src/Intro_nonfeatures.rst index d034ccb443..af406a127e 100644 --- a/doc/src/Intro_nonfeatures.rst +++ b/doc/src/Intro_nonfeatures.rst @@ -77,7 +77,7 @@ Here are suggestions on how to perform these tasks: it easier to analyze and plot. See the :doc:`Tools ` doc page for more discussion of the various tools. * **Pizza.py:** Our group has also written a separate toolkit called - `Pizza.py `_ which can do certain kinds of + `Pizza.py `_ which can do certain kinds of setup, analysis, plotting, and visualization (via OpenGL) for LAMMPS simulations. It thus provides some functionality for several of the above bullets. Pizza.py is written in `Python `_ diff --git a/doc/src/Intro_overview.rst b/doc/src/Intro_overview.rst index 8ab85a0d99..8d8d4eae83 100644 --- a/doc/src/Intro_overview.rst +++ b/doc/src/Intro_overview.rst @@ -18,10 +18,11 @@ supercomputers. .. _mpi: https://en.wikipedia.org/wiki/Message_Passing_Interface .. _lws: https://www.lammps.org -LAMMPS is written in C++. Earlier versions were written in F77 and -F90. See the `History page `_ of -the website for details. All versions can be downloaded from the -`LAMMPS website `_. +LAMMPS is written in C++ and requires a compiler that is at least +compatible with the C++-11 standard. +Earlier versions were written in F77 and F90. See the `History page +`_ of the website for details. All +versions can be downloaded from the `LAMMPS website `_. LAMMPS is designed to be easy to modify or extend with new capabilities, such as new force fields, atom types, boundary @@ -41,8 +42,9 @@ short distances, so that the local density of particles never becomes too large. This is in contrast to methods used for modeling plasma or gravitational bodies (e.g. galaxy formation). -On parallel machines, LAMMPS uses spatial-decomposition techniques to -partition the simulation domain into small sub-domains of equal -computational cost, one of which is assigned to each processor. -Processors communicate and store "ghost" atom information for atoms -that border their sub-domain. +On parallel machines, LAMMPS uses spatial-decomposition techniques with +MPI parallelization to partition the simulation domain into small +sub-domains of equal computational cost, one of which is assigned to +each processor. Processors communicate and store "ghost" atom +information for atoms that border their sub-domain. Multi-threading +parallelization with particle-decomposition can be used in addition.
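The hybrid scheme described above in Intro_overview.rst -- MPI spatial decomposition across sub-domains plus multi-threading over the particles inside each sub-domain -- can be pictured with the following minimal sketch. It is illustrative only, not part of the patch series, and all names in it are hypothetical stand-ins for the real LAMMPS machinery:

// Illustrative sketch (hypothetical names): each MPI rank owns the
// particles in its spatial sub-domain; OpenMP threads then split the
// loop over those local particles.  Build with e.g. mpicxx -fopenmp.
#include <mpi.h>
#include <vector>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  const int nlocal = 100000;           // particles in this rank's sub-domain
  std::vector<double> x(nlocal, 1.0);  // stand-in per-particle data

  // thread-level particle decomposition inside the MPI sub-domain
  double local_energy = 0.0;
  #pragma omp parallel for reduction(+:local_energy)
  for (int i = 0; i < nlocal; ++i)
    local_energy += x[i] * x[i];       // stand-in for a force/energy kernel

  // combine the per-sub-domain partial results across all ranks
  double total_energy = 0.0;
  MPI_Allreduce(&local_energy, &total_energy, 1, MPI_DOUBLE, MPI_SUM,
                MPI_COMM_WORLD);
  if (rank == 0) printf("total energy: %g\n", total_energy);

  MPI_Finalize();
  return 0;
}

In the real code, the "ghost" atoms mentioned above would additionally be exchanged between neighboring sub-domains before each force computation.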
From cca17feb27c91944ef4a8e4a2ff6e5fa6e61e5f8 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 26 Jul 2021 12:31:09 -0400 Subject: [PATCH 09/17] silence compiler warnings, remove dead code --- src/DIELECTRIC/pppm_disp_dielectric.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/DIELECTRIC/pppm_disp_dielectric.cpp b/src/DIELECTRIC/pppm_disp_dielectric.cpp index bb00a74922..b6a82296b2 100644 --- a/src/DIELECTRIC/pppm_disp_dielectric.cpp +++ b/src/DIELECTRIC/pppm_disp_dielectric.cpp @@ -647,7 +647,6 @@ void PPPMDispDielectric::fieldforce_c_ad() // convert E-field to force and substract self forces const double qfactor = qqrd2e * scale; - double qtmp = eps[i]*q[i]; s1 = x[i][0]*hx_inv; s2 = x[i][1]*hy_inv; s3 = x[i][2]*hz_inv; @@ -751,7 +750,7 @@ void PPPMDispDielectric::fieldforce_c_peratom() extended to non-neutral systems (J. Chem. Phys. 131, 094107). ------------------------------------------------------------------------- */ -void PPPMDispDielectric::slabcorr(int eflag) +void PPPMDispDielectric::slabcorr(int /*eflag*/) { // compute local contribution to global dipole moment From fefcd0e2af255056a0dc49a1604e02b9fa92a8c8 Mon Sep 17 00:00:00 2001 From: Mike Brown Date: Mon, 26 Jul 2021 11:22:21 -0700 Subject: [PATCH 10/17] Converting Cilk vectorization directives to the OpenMP standard and changing some more deprecated vector intrinsics. Data alignment directives for compiler vectorization are still mostly Intel-specific. --- src/INTEL/angle_charmm_intel.cpp | 8 + src/INTEL/angle_harmonic_intel.cpp | 8 + src/INTEL/bond_fene_intel.cpp | 8 + src/INTEL/bond_harmonic_intel.cpp | 8 + src/INTEL/dihedral_charmm_intel.cpp | 19 +- src/INTEL/dihedral_fourier_intel.cpp | 8 + src/INTEL/dihedral_harmonic_intel.cpp | 8 + src/INTEL/dihedral_opls_intel.cpp | 8 + src/INTEL/fix_intel.cpp | 32 ++- src/INTEL/fix_nh_intel.cpp | 66 ++++- src/INTEL/fix_nve_asphere_intel.cpp | 24 +- src/INTEL/fix_nve_intel.cpp | 36 ++- src/INTEL/improper_cvff_intel.cpp | 12 + src/INTEL/improper_harmonic_intel.cpp | 8 + src/INTEL/intel_intrinsics.h | 3 +- src/INTEL/intel_intrinsics_airebo.h | 36 +-- src/INTEL/intel_preprocess.h | 200 +++++++++++++++ src/INTEL/intel_simd.h | 239 +++++++++++------- src/INTEL/npair_full_bin_ghost_intel.cpp | 26 +- src/INTEL/npair_intel.cpp | 40 ++- src/INTEL/pair_buck_coul_cut_intel.cpp | 10 +- src/INTEL/pair_buck_coul_long_intel.cpp | 9 +- src/INTEL/pair_buck_intel.cpp | 10 +- src/INTEL/pair_dpd_intel.cpp | 9 +- src/INTEL/pair_eam_intel.cpp | 39 ++- src/INTEL/pair_gayberne_intel.cpp | 15 +- .../pair_lj_charmm_coul_charmm_intel.cpp | 9 +- src/INTEL/pair_lj_charmm_coul_long_intel.cpp | 9 +- src/INTEL/pair_lj_cut_coul_long_intel.cpp | 9 +- src/INTEL/pair_lj_cut_intel.cpp | 8 +- src/INTEL/pair_sw_intel.cpp | 6 +- src/INTEL/pppm_disp_intel.cpp | 214 +++++++++++++++- src/INTEL/pppm_intel.cpp | 58 ++++- 33 files changed, 1013 insertions(+), 189 deletions(-) diff --git a/src/INTEL/angle_charmm_intel.cpp b/src/INTEL/angle_charmm_intel.cpp index 29b7ec208b..26943934be 100644 --- a/src/INTEL/angle_charmm_intel.cpp +++ b/src/INTEL/angle_charmm_intel.cpp @@ -162,7 +162,11 @@ void AngleCharmmIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -246,7 +250,11 @@ void AngleCharmmIntel::eval(const
int vflag, // apply force to each of 3 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/angle_harmonic_intel.cpp b/src/INTEL/angle_harmonic_intel.cpp index a2d8cc7d13..e392730edc 100644 --- a/src/INTEL/angle_harmonic_intel.cpp +++ b/src/INTEL/angle_harmonic_intel.cpp @@ -162,7 +162,11 @@ void AngleHarmonicIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -228,7 +232,11 @@ void AngleHarmonicIntel::eval(const int vflag, // apply force to each of 3 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/bond_fene_intel.cpp b/src/INTEL/bond_fene_intel.cpp index 44a8c0d3cf..1ab8da68d9 100644 --- a/src/INTEL/bond_fene_intel.cpp +++ b/src/INTEL/bond_fene_intel.cpp @@ -158,7 +158,11 @@ void BondFENEIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -215,7 +219,11 @@ void BondFENEIntel::eval(const int vflag, // apply force to each of 2 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/bond_harmonic_intel.cpp b/src/INTEL/bond_harmonic_intel.cpp index a37ae091a0..35b194f0fa 100644 --- a/src/INTEL/bond_harmonic_intel.cpp +++ b/src/INTEL/bond_harmonic_intel.cpp @@ -155,7 +155,11 @@ void BondHarmonicIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -184,7 +188,11 @@ void BondHarmonicIntel::eval(const int vflag, // apply force to each of 2 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/dihedral_charmm_intel.cpp b/src/INTEL/dihedral_charmm_intel.cpp index a317be00fb..4116d9134f 100644 --- a/src/INTEL/dihedral_charmm_intel.cpp +++ b/src/INTEL/dihedral_charmm_intel.cpp @@ -181,9 +181,16 @@ void DihedralCharmmIntel::eval(const int vflag, } #if defined(LMP_SIMD_COMPILER_TEST) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \ + sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \ + spv5) +#else #pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \ - sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5) + sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \ + spv5) +#endif + #pragma vector aligned for (int n = nfrom; n < nto; n++) { #endif for (int n = nfrom; n < nto; n += npl) { @@ 
-329,7 +336,11 @@ void DihedralCharmmIntel::eval(const int vflag, #if defined(LMP_SIMD_COMPILER_TEST) +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i2 < nlocal) { @@ -408,7 +419,11 @@ void DihedralCharmmIntel::eval(const int vflag, // apply force to each of 4 atoms #if defined(LMP_SIMD_COMPILER_TEST) +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/dihedral_fourier_intel.cpp b/src/INTEL/dihedral_fourier_intel.cpp index 4d44ea36d2..d952ac7506 100644 --- a/src/INTEL/dihedral_fourier_intel.cpp +++ b/src/INTEL/dihedral_fourier_intel.cpp @@ -154,7 +154,11 @@ void DihedralFourierIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -304,7 +308,11 @@ void DihedralFourierIntel::eval(const int vflag, } #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/dihedral_harmonic_intel.cpp b/src/INTEL/dihedral_harmonic_intel.cpp index f7009689c7..df9304b6ba 100644 --- a/src/INTEL/dihedral_harmonic_intel.cpp +++ b/src/INTEL/dihedral_harmonic_intel.cpp @@ -154,7 +154,11 @@ void DihedralHarmonicIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -299,7 +303,11 @@ void DihedralHarmonicIntel::eval(const int vflag, } #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/dihedral_opls_intel.cpp b/src/INTEL/dihedral_opls_intel.cpp index ab007dad8c..89f06773d5 100644 --- a/src/INTEL/dihedral_opls_intel.cpp +++ b/src/INTEL/dihedral_opls_intel.cpp @@ -158,7 +158,11 @@ void DihedralOPLSIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -319,7 +323,11 @@ void DihedralOPLSIntel::eval(const int vflag, } #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/fix_intel.cpp b/src/INTEL/fix_intel.cpp index 59eea4961a..d0633d7791 100644 --- a/src/INTEL/fix_intel.cpp +++ b/src/INTEL/fix_intel.cpp @@ -635,19 +635,31 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar) if (_nthreads == 4) { acc_t *f_scalar3 = f_scalar2 + f_stride4; acc_t *f_scalar4 = f_scalar3 + f_stride4; - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") + #if defined(USE_OMP_SIMD) + #pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4:64) + #elif 
defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int n = 0; n < o_range; n++) f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; } else if (_nthreads == 2) { - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") + #if defined(USE_OMP_SIMD) + #pragma omp simd aligned(f_scalar,f_scalar2:64) + #elif defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int n = 0; n < o_range; n++) f_scalar[n] += f_scalar2[n]; } else { acc_t *f_scalar3 = f_scalar2 + f_stride4; - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") + #if defined(USE_OMP_SIMD) + #pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3:64) + #elif defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int n = 0; n < o_range; n++) f_scalar[n] += f_scalar2[n] + f_scalar3[n]; } @@ -662,8 +674,12 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar) acc_t *f_scalar2 = f_scalar + f_stride4; for (int t = 1; t < _nthreads; t++) { - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") + #if defined(USE_OMP_SIMD) + #pragma omp simd aligned(f_scalar,f_scalar2:64) + #elif defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int n = iifrom; n < iito; n++) f_scalar[n] += f_scalar2[n]; f_scalar2 += f_stride4; diff --git a/src/INTEL/fix_nh_intel.cpp b/src/INTEL/fix_nh_intel.cpp index 5370e3a13f..a4fdecbd96 100644 --- a/src/INTEL/fix_nh_intel.cpp +++ b/src/INTEL/fix_nh_intel.cpp @@ -99,8 +99,12 @@ void FixNHIntel::remap() if (allremap) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { const double d0 = x[i].x - b0; @@ -112,8 +116,12 @@ void FixNHIntel::remap() } } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & dilate_group_bit) { @@ -278,8 +286,12 @@ void FixNHIntel::remap() if (allremap) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; @@ -288,8 +300,12 @@ void FixNHIntel::remap() } } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & dilate_group_bit) { @@ -415,8 +431,12 @@ void FixNHIntel::nh_v_press() if (igroup == 0) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { v[i].x *= f0; @@ -425,8 +445,12 @@ void FixNHIntel::nh_v_press() } } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { @@ -448,8 +472,12 @@ void FixNHIntel::nve_v() double * _noalias const v = atom->v[0]; const double * _noalias const f = atom->f[0]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i 
< _nlocal3; i++) v[i] += _dtfm[i] * f[i]; @@ -468,15 +496,23 @@ void FixNHIntel::nve_x() if (igroup == 0) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) x[i] += dtv * v[i]; } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) @@ -500,15 +536,23 @@ void FixNHIntel::nh_v_temp() if (igroup == 0) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) v[i] *= factor_eta; } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) diff --git a/src/INTEL/fix_nve_asphere_intel.cpp b/src/INTEL/fix_nve_asphere_intel.cpp index 78504c237a..eda8b48a67 100644 --- a/src/INTEL/fix_nve_asphere_intel.cpp +++ b/src/INTEL/fix_nve_asphere_intel.cpp @@ -97,8 +97,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/) dtq = 0.5 * dtv; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += _dtfm[i] * f[i]; @@ -108,8 +112,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/) // update angular momentum by 1/2 step if (igroup == 0) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { double *quat = bonus[ellipsoid[i]].quat; @@ -118,8 +126,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/) } } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { @@ -143,8 +155,12 @@ void FixNVEAsphereIntel::final_integrate() const double * _noalias const torque = atom->torque[0]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += _dtfm[i] * f[i]; diff --git a/src/INTEL/fix_nve_intel.cpp b/src/INTEL/fix_nve_intel.cpp index fb90946da0..9670af65c2 100644 --- a/src/INTEL/fix_nve_intel.cpp +++ b/src/INTEL/fix_nve_intel.cpp @@ -68,8 +68,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/) if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) { const double dtfm = dtf / atom->mass[1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += dtfm * f[i]; @@ -78,8 +82,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/) } else if (igroup == 0) { if (neighbor->ago == 0) reset_dt(); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += _dtfm[i] * f[i]; @@ -88,8 
+96,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/) } else { if (neighbor->ago == 0) reset_dt(); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) { @@ -112,16 +124,24 @@ void FixNVEIntel::final_integrate() _nlocal3 = 3 * atom->nlocal; const double dtfm = dtf / atom->mass[1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) v[i] += dtfm * f[i]; } else if (igroup == 0) { if (neighbor->ago == 0) reset_dt(); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += _dtfm[i] * f[i]; @@ -129,8 +149,12 @@ void FixNVEIntel::final_integrate() } else { if (neighbor->ago == 0) reset_dt(); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) v[i] += _dtfm[i] * f[i]; diff --git a/src/INTEL/improper_cvff_intel.cpp b/src/INTEL/improper_cvff_intel.cpp index 62dcde36b9..4d473de7aa 100644 --- a/src/INTEL/improper_cvff_intel.cpp +++ b/src/INTEL/improper_cvff_intel.cpp @@ -165,7 +165,11 @@ void ImproperCvffIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -247,7 +251,11 @@ void ImproperCvffIntel::eval(const int vflag, flt_t p, pd; #ifdef LMP_INTEL_USE_SIMDOFF_FIX +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (m == 2) { @@ -319,7 +327,11 @@ void ImproperCvffIntel::eval(const int vflag, // apply force to each of 4 atoms #ifdef LMP_INTEL_USE_SIMDOFF_FIX +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/improper_harmonic_intel.cpp b/src/INTEL/improper_harmonic_intel.cpp index b3d4c342d9..1a637fa1a6 100644 --- a/src/INTEL/improper_harmonic_intel.cpp +++ b/src/INTEL/improper_harmonic_intel.cpp @@ -167,7 +167,11 @@ void ImproperHarmonicIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -276,7 +280,11 @@ void ImproperHarmonicIntel::eval(const int vflag, // apply force to each of 4 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/intel_intrinsics.h b/src/INTEL/intel_intrinsics.h index 295310283d..567f04c5dc 100644 --- a/src/INTEL/intel_intrinsics.h +++ b/src/INTEL/intel_intrinsics.h @@ -127,7 +127,8 @@ struct vector_ops { } template static fvec gather(const fvec &from, bvec mask, const ivec &idx, 
const void *base) { - return _mm512_mask_i32logather_pd(from, mask, idx, base, scale); + return _mm512_mask_i32gather_pd(from, mask, _mm512_castsi512_si256(idx), + base, scale); } static fvec blend(const bvec &mask, const fvec &a, const fvec &b) { return _mm512_mask_blend_pd(mask, a, b); diff --git a/src/INTEL/intel_intrinsics_airebo.h b/src/INTEL/intel_intrinsics_airebo.h index ac58ca2438..ea29888ea1 100644 --- a/src/INTEL/intel_intrinsics_airebo.h +++ b/src/INTEL/intel_intrinsics_airebo.h @@ -511,7 +511,8 @@ public: const int scale) { assert(scale == sizeof(FVEC_SCAL_T)); # if FVEC_LEN==8 - return FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem, sizeof(FVEC_SCAL_T)); + return FVEC_SUFFIX(_mm512_i32gather_)(_mm512_castsi512_si256(idx.val_), + mem, sizeof(FVEC_SCAL_T)); # else return FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem, sizeof(FVEC_SCAL_T)); # endif @@ -522,8 +523,8 @@ public: ) { assert(scale == sizeof(FVEC_SCAL_T)); # if FVEC_LEN==8 - return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_, - mem, sizeof(FVEC_SCAL_T)); + return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, + _mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T)); # else return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_, mem, sizeof(FVEC_SCAL_T)); @@ -609,8 +610,8 @@ public: ) { assert(scale == sizeof(FVEC_SCAL_T)); # if FVEC_LEN==8 - return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_, - mem, sizeof(FVEC_SCAL_T)); + return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, + _mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T)); # else return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_, mem, sizeof(FVEC_SCAL_T)); @@ -622,8 +623,9 @@ public: ) { assert(scale == sizeof(FVEC_SCAL_T)); # if FVEC_LEN==8 - FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, a.val_, - sizeof(FVEC_SCAL_T)); + FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, + _mm512_castsi512_si256(idx.val_), + a.val_, sizeof(FVEC_SCAL_T)); # else FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, a.val_, sizeof(FVEC_SCAL_T)); @@ -666,11 +668,11 @@ public: const double * mem, const int scale ) { assert(scale == sizeof(double)); - __m512d lo = _mm512_mask_i32logather_pd(src.lo_, mask.val_, idx.val_, mem, - sizeof(double)); - __m512d hi = _mm512_mask_i32logather_pd(src.hi_, get_bvec_hi(mask.val_), - get_ivec_hi(idx.val_), mem, - sizeof(double)); + __m512d lo = _mm512_mask_i32gather_pd(src.lo_, mask.val_, + _mm512_castsi512_si256(idx.val_), + mem, sizeof(double)); + __m512d hi = _mm512_mask_i32gather_pd(src.hi_, get_bvec_hi(mask.val_), + _mm512_castsi512_si256(get_ivec_hi(idx.val_)), mem, sizeof(double)); return avec16pd(lo, hi); } VEC_INLINE static void mask_i32loscatter( @@ -678,10 +680,12 @@ public: const avec16pd &a, const int scale ) { assert(scale == sizeof(double)); - _mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_, - sizeof(double)); - _mm512_mask_i32loscatter_pd(mem, get_bvec_hi(mask.val_), - get_ivec_hi(idx.val_), a.hi_, sizeof(double)); + _mm512_mask_i32scatter_pd(mem, mask.val_, + _mm512_castsi512_si256(idx.val_), a.lo_, + sizeof(double)); + _mm512_mask_i32scatter_pd(mem, get_bvec_hi(mask.val_), + _mm512_castsi512_si256(get_ivec_hi(idx.val_)), + a.hi_, sizeof(double)); } #define AVEC2_BINOP(the_sym, the_name) \ diff --git a/src/INTEL/intel_preprocess.h b/src/INTEL/intel_preprocess.h index 0bec9935db..41c91d1578 100644 --- a/src/INTEL/intel_preprocess.h +++ b/src/INTEL/intel_preprocess.h @@ 
-17,8 +17,13 @@ ------------------------------------------------------------------------- */ #ifdef __INTEL_LLVM_COMPILER +#define USE_OMP_SIMD #define __INTEL_COMPILER __INTEL_LLVM_COMPILER #define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER +#define _MM_SCALE_1 1 +#define _MM_SCALE_2 2 +#define _MM_SCALE_4 4 +#define _MM_SCALE_8 8 #endif #ifdef __INTEL_COMPILER @@ -332,6 +337,9 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #endif +// TO BE DEPRECATED +#ifndef USE_OMP_SIMD + #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ f_stride, pos, ov0, ov1, ov2, \ ov3, ov4, ov5) \ @@ -526,6 +534,198 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, } \ } +#else + +#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ + f_stride, pos, ov0, ov1, ov2, \ + ov3, ov4, ov5) \ +{ \ + acc_t *f_scalar = &f_start[0].x; \ + flt_t *x_scalar = &pos[minlocal].x; \ + int f_stride4 = f_stride * 4; \ + _alignvar(acc_t ovv[16],64); \ + int vwidth; \ + if (sizeof(acc_t) == sizeof(double)) \ + vwidth = INTEL_COMPILE_WIDTH/2; \ + else \ + vwidth = INTEL_COMPILE_WIDTH; \ + if (vwidth < 4) vwidth = 4; \ + _use_simd_pragma("omp simd aligned(ovv:64)") \ + for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \ + int remainder = lt % vwidth; \ + if (lf > lt) remainder = 0; \ + const int v_range = lt - remainder; \ + if (nthreads == 2) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,ovv,x_scalar:64)")\ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n]; \ + } else if (nthreads==4) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + acc_t *f_scalar3 = f_scalar2 + f_stride4; \ + acc_t *f_scalar4 = f_scalar3 + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4,ovv:64)") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \ + f_scalar4[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + 
} \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \ + } else if (nthreads==1) { \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("omp simd aligned(ovv,f_scalar,x_scalar:64)") \ + for (int v = 0; v < vwidth; v++) \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + } else if (nthreads==3) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + acc_t *f_scalar3 = f_scalar2 + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,ovv,x_scalar:64)") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \ + } \ + for (int n = v_range; n < lt; n += 4) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + for (int v = 0; v < 4; v++) \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + } \ + ov0 += ovv[0]; \ + ov1 += ovv[1]; \ + ov2 += ovv[2]; \ + if (vwidth > 4) { \ + ov0 += ovv[4]; \ + ov1 += ovv[5]; \ + ov2 += ovv[6]; \ + } \ + if (vwidth > 8) { \ + ov0 += ovv[8] + ovv[12]; \ + ov1 += ovv[9] + ovv[13]; \ + ov2 += ovv[10] + ovv[14]; \ + } \ +} + +#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \ + f_stride, pos, offload, vflag, ov0, ov1, \ + ov2, ov3, ov4, ov5) \ +{ \ + int o_range = (nall - minlocal) * 4; \ + IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \ + sizeof(acc_t)); \ + \ + acc_t *f_scalar = &f_start[0].x; \ + int f_stride4 = f_stride * 4; \ + int t; \ + if (vflag == VIRIAL_FDOTR) t = 4; else t = 1; \ + acc_t *f_scalar2 = f_scalar + f_stride4 * t; \ + for ( ; t < nthreads; t++) { \ + _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2:64)") \ + for (int n = iifrom; n < iito; n++) \ + f_scalar[n] += f_scalar2[n]; \ + f_scalar2 += f_stride4; \ + } \ + \ + if (vflag == VIRIAL_FDOTR) { \ + int nt_min = 
MIN(4,nthreads); \ + IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \ + f_stride, pos, ov0, ov1, ov2, ov3, ov4, \ + ov5); \ + } \ +} + +#endif + #ifdef _LMP_INTEL_OFFLOAD #include diff --git a/src/INTEL/intel_simd.h b/src/INTEL/intel_simd.h index 165455a33d..eb5d9857a5 100644 --- a/src/INTEL/intel_simd.h +++ b/src/INTEL/intel_simd.h @@ -173,7 +173,7 @@ namespace ip_simd { } inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) { - return _mm512_i32logather_pd(i, p, _MM_SCALE_8); + return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8); } inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p, @@ -190,8 +190,8 @@ namespace ip_simd { inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p, const SIMD_int &i) { - return _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, p, - _MM_SCALE_8); + return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), p, _MM_SCALE_8); } template @@ -227,8 +227,8 @@ namespace ip_simd { inline SIMD_double SIMD_gatherz(const SIMD_mask &m, const double *p, const SIMD_int &i) { - return _mm512_mask_i32logather_pd( _mm512_set1_pd(0.0), m, i, p, - _MM_SCALE_8); + return _mm512_mask_i32gather_pd( _mm512_set1_pd(0.0), m, + _mm512_castsi512_si256(i),p, _MM_SCALE_8); } // ------- Store Operations @@ -257,7 +257,8 @@ namespace ip_simd { inline void SIMD_scatter(const SIMD_mask &m, double *p, const SIMD_int &i, const SIMD_double &vec) { - _mm512_mask_i32loscatter_pd(p, m, i, vec, _MM_SCALE_8); + _mm512_mask_i32scatter_pd(p, m, _mm512_castsi512_si256(i), vec, + _MM_SCALE_8); } // ------- Arithmetic Operations @@ -834,23 +835,29 @@ namespace ip_simd { inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom, const SIMD_int &i, SIMD_double &x, SIMD_double &y, SIMD_double &z) { - x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, - _MM_SCALE_2); - y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1, - _MM_SCALE_2); - z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2, - _MM_SCALE_2); + x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom, + _MM_SCALE_2); + y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom+1, + _MM_SCALE_2); + z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom+2, + _MM_SCALE_2); } inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom, const SIMD_int &i, SIMD_double &x, SIMD_double &y, SIMD_double &z, SIMD_int &type) { - x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, - _MM_SCALE_2); - y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1, - _MM_SCALE_2); - z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2, - _MM_SCALE_2); + x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom, + _MM_SCALE_2); + y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom+1, + _MM_SCALE_2); + z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom+2, + _MM_SCALE_2); type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3, _MM_SCALE_2); } @@ -888,10 +895,12 @@ namespace ip_simd { const SIMD_int &joffset, SIMD_double &eng) { SIMD_double jeng; SIMD_conflict_pi_reduce1(rmask, joffset, eng); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), 
rmask, + _mm512_castsi512_si256(joffset), + force, _MM_SCALE_2); jeng = jeng + eng; - _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset), + jeng, _MM_SCALE_2); } inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, @@ -899,20 +908,24 @@ namespace ip_simd { SIMD_double engd, jeng; engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng)); SIMD_conflict_pi_reduce1(rmask, joffset, engd); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force, _MM_SCALE_2); jeng = jeng + engd; - _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset), + jeng, _MM_SCALE_2); SIMD_mask rmask2 = rmask >> 8; engd = _mm512_cvtps_pd(_mm512_castps512_ps256( _mm512_shuffle_f32x4(eng,eng,238))); SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); SIMD_conflict_pi_reduce1(rmask2, joffset2, engd); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2, + _mm512_castsi512_si256(joffset2), + force, _MM_SCALE_2); jeng = jeng + engd; - _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2), + jeng, _MM_SCALE_2); } inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force, @@ -926,10 +939,12 @@ namespace ip_simd { SIMD_double jeng; SIMD_conflict_pi_reduce1(rmask, joffset, eng); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force, _MM_SCALE_2); jeng = jeng + eng; - _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset), + jeng, _MM_SCALE_2); } inline void SIMD_safe_jforce(const SIMD_mask &m, float *force, @@ -956,18 +971,24 @@ namespace ip_simd { SIMD_double &fy, SIMD_double &fz) { SIMD_conflict_pi_reduce3(m, i, fx, fy, fz); SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force, + _MM_SCALE_2); jfrc = jfrc + fx; - _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 1, + _MM_SCALE_2); jfrc = jfrc + fy; - _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 2, + _MM_SCALE_2); jfrc = jfrc + fz; - _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); } inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force, @@ -979,40 
+1000,54 @@ namespace ip_simd { amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz)); SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd); SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force, _MM_SCALE_2); jfrc = jfrc + amxd; - _mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force + 1, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset), + jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force + 1, _MM_SCALE_2); jfrc = jfrc + amyd; - _mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force + 2, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, rmask, _mm512_castsi512_si256(joffset), + jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force + 2, _MM_SCALE_2); jfrc = jfrc + amzd; - _mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, rmask, _mm512_castsi512_si256(joffset), + jfrc, _MM_SCALE_2); SIMD_mask rmask2 = rmask >> 8; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amx,amx,238))); + _mm512_shuffle_f32x4(amx,amx,238))); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amy,amy,238))); + _mm512_shuffle_f32x4(amy,amy,238))); amzd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amz,amz,238))); + _mm512_shuffle_f32x4(amz,amz,238))); SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2, + _mm512_castsi512_si256(joffset2), + force, _MM_SCALE_2); jfrc = jfrc + amxd; - _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force + 1, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2), + jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2, + _mm512_castsi512_si256(joffset2), + force + 1, _MM_SCALE_2); jfrc = jfrc + amyd; - _mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force + 2, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, rmask2, + _mm512_castsi512_si256(joffset2), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2, + _mm512_castsi512_si256(joffset2), + force + 2, _MM_SCALE_2); jfrc = jfrc + amzd; - _mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, rmask2, + _mm512_castsi512_si256(joffset2), jfrc, + _MM_SCALE_2); } inline void SIMD_jforce_update(const SIMD_mask &m, float *force, @@ -1064,18 +1099,24 @@ namespace ip_simd { const SIMD_int &i, const SIMD_double &fx, const SIMD_double &fy, const SIMD_double &fz) { SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = 
_mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force, + _MM_SCALE_2); jfrc = jfrc - fx; - _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 1, + _MM_SCALE_2); jfrc = jfrc - fy; - _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 2, + _MM_SCALE_2); jfrc = jfrc - fz; - _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); } inline void SIMD_jforce_update(const SIMD_mask &rmask, @@ -1502,11 +1543,12 @@ namespace ip_simd { fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); SIMD_conflict_pi_reduce1(hmask, k, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask, k, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask, + _mm512_castsi512_si256(k), + force + 3, _MM_SCALE_2); keng = keng + hevdwl; - _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k), + keng, _MM_SCALE_2); } } @@ -1523,11 +1565,12 @@ namespace ip_simd { fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); SIMD_conflict_pi_reduce1(hmask, k, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask, k, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask, + _mm512_castsi512_si256(k), + force + 3, _MM_SCALE_2); keng = keng + hevdwl; - _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k), + keng, _MM_SCALE_2); } SIMD_mask hmask2 = hmask >> 8; facradd = _mm512_cvtps_pd(_mm512_castps512_ps256( @@ -1539,11 +1582,13 @@ namespace ip_simd { fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl); SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238); SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask2, k2, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), + hmask2, + _mm512_castsi512_si256(k2), + force + 3, _MM_SCALE_2); keng = keng + hevdwl; - _mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force + 3, hmask2, _mm512_castsi512_si256(k2), + keng, _MM_SCALE_2); } } @@ -1815,24 +1860,32 @@ namespace ip_simd { const int EFLAG, const int eatom, const SIMD_double &fwtmp) { SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force, + _MM_SCALE_2); jfrc = jfrc + fx; - _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + 
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 1, + _MM_SCALE_2); jfrc = jfrc + fy; - _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 2, + _MM_SCALE_2); jfrc = jfrc + fz; - _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); if (EFLAG) { if (eatom) { - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, - force + 3, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), + force + 3, _MM_SCALE_2); jfrc = jfrc + fwtmp; - _mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+3, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); } } } diff --git a/src/INTEL/npair_full_bin_ghost_intel.cpp b/src/INTEL/npair_full_bin_ghost_intel.cpp index 082f95721f..e96f2c713d 100644 --- a/src/INTEL/npair_full_bin_ghost_intel.cpp +++ b/src/INTEL/npair_full_bin_ghost_intel.cpp @@ -324,7 +324,11 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int jj = bstart; jj < bend; jj++) tj[ncount++] = binpacked[jj]; @@ -345,15 +349,23 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + stencil[k]]; const int bend = binhead[ibin + stencil[k] + 1]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int jj = bstart; jj < bend; jj++) tj[ncount++] = binpacked[jj]; } } // if i < nlocal #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int u = 0; u < ncount; u++) { const int j = tj[u]; @@ -425,12 +437,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, int alln = n; n = 0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #ifdef LMP_INTEL_NBOR_COMPAT #pragma ivdep #else +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif + #pragma vector aligned #endif for (int u = 0; u < alln; u++) { int which; @@ -454,12 +470,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, alln = n2; n2 = maxnbors * 2; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #ifdef LMP_INTEL_NBOR_COMPAT #pragma ivdep #else +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif + #pragma vector aligned #endif for (int u = n2; u < alln; u++) { int which; diff --git a/src/INTEL/npair_intel.cpp b/src/INTEL/npair_intel.cpp index 643ceff8f3..395e50006c 100644 --- a/src/INTEL/npair_intel.cpp +++ b/src/INTEL/npair_intel.cpp @@ -344,14 +344,22 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd 
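+                // with USE_OMP_SIMD, the portable OpenMP 4.0 pragma is emitted;
+                // the #else branch below keeps the legacy Intel-only "#pragma simd"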
+#else #pragma simd +#endif #endif for (int jj = bstart; jj < bend; jj++) tj[ncount++] = binpacked[jj]; } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int u = 0; u < ncount; u++) { const int j = tj[u]; @@ -375,7 +383,11 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin]; const int bend = binhead[ibin + 1]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int jj = bstart; jj < bend; jj++) { const int j = binpacked[jj]; @@ -533,12 +545,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, n = pack_offset; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #ifdef LMP_INTEL_NBOR_COMPAT #pragma ivdep #else +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif + #pragma vector aligned #endif for (int u = n; u < alln; u++) { int which; @@ -566,12 +582,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, n2 = pack_offset + maxnbors; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #ifdef LMP_INTEL_NBOR_COMPAT #pragma ivdep #else +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif + #pragma vector aligned #endif for (int u = n2; u < alln; u++) { int which; @@ -737,8 +757,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, int jnum = numneigh[i]; if (!THREE) IP_PRE_neighbor_pad(jnum, offload); #if __INTEL_COMPILER+0 > 1499 +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(max:vlmax,vgmax) \ + reduction(min:vlmin, vgmin) +#else + #pragma simd reduction(max:vlmax,vgmax) \ + reduction(min:vlmin, vgmin) +#endif #pragma vector aligned - #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) #endif for (int jj = 0; jj < jnum; jj++) { const int j = jlist[jj] & NEIGHMASK; @@ -782,8 +808,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, int jnum = numneigh[i]; if (!THREE) IP_PRE_neighbor_pad(jnum, offload); int jj = 0; - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned for (jj = 0; jj < jnum; jj++) { const int which = jlist[jj] >> SBBITS & 3; const int j = jlist[jj] & NEIGHMASK; diff --git a/src/INTEL/pair_buck_coul_cut_intel.cpp b/src/INTEL/pair_buck_coul_cut_intel.cpp index 99905bfaa0..c67450fbc1 100644 --- a/src/INTEL/pair_buck_coul_cut_intel.cpp +++ b/src/INTEL/pair_buck_coul_cut_intel.cpp @@ -248,12 +248,18 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, fxtmp = fytmp = fztmp = (acc_t)0; if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; if (NEWTON_PAIR == 0) - if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + if (vflag == VIRIAL_PAIR) + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcecoul, forcebuck, evdwl, ecoul; diff --git a/src/INTEL/pair_buck_coul_long_intel.cpp b/src/INTEL/pair_buck_coul_long_intel.cpp index 1566ec23b6..7c795d5914 100644 --- a/src/INTEL/pair_buck_coul_long_intel.cpp +++ b/src/INTEL/pair_buck_coul_long_intel.cpp @@ 
-309,9 +309,14 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#else + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcebuck, evdwl, ecoul; diff --git a/src/INTEL/pair_buck_intel.cpp b/src/INTEL/pair_buck_intel.cpp index 26ef13be9a..ddab17765b 100644 --- a/src/INTEL/pair_buck_intel.cpp +++ b/src/INTEL/pair_buck_intel.cpp @@ -230,12 +230,18 @@ void PairBuckIntel::eval(const int offload, const int vflag, fxtmp = fytmp = fztmp = (acc_t)0; if (EFLAG) fwtmp = sevdwl = (acc_t)0; if (NEWTON_PAIR == 0) - if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + if (vflag == VIRIAL_PAIR) + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < jnum; jj++) { diff --git a/src/INTEL/pair_dpd_intel.cpp b/src/INTEL/pair_dpd_intel.cpp index e7514a1f95..a9eb4fe6a4 100644 --- a/src/INTEL/pair_dpd_intel.cpp +++ b/src/INTEL/pair_dpd_intel.cpp @@ -289,9 +289,14 @@ void PairDPDIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + sv0, sv1, sv2, sv3, sv4, sv5) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcelj, evdwl; diff --git a/src/INTEL/pair_eam_intel.cpp b/src/INTEL/pair_eam_intel.cpp index dcff8957fd..13dbd60cb3 100644 --- a/src/INTEL/pair_eam_intel.cpp +++ b/src/INTEL/pair_eam_intel.cpp @@ -327,8 +327,12 @@ void PairEAMIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:rhoi) +#else #pragma simd reduction(+:rhoi) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < ej; jj++) { int jtype; @@ -369,23 +373,35 @@ void PairEAMIntel::eval(const int offload, const int vflag, const int rcount = nall; if (nthreads == 2) { double *trho2 = rho + nmax; - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned for (int n = 0; n < rcount; n++) rho[n] += trho2[n]; } else if (nthreads == 4) { double *trho2 = rho + nmax; double *trho3 = trho2 + nmax; double *trho4 = trho3 + nmax; - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned for (int n = 0; n < rcount; n++) rho[n] += trho2[n] + trho3[n] + trho4[n]; } else { double *trhon = rho + nmax; for (int t = 1; t < nthreads; t++) { - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned for (int n = 0; n < rcount; n++) rho[n] += trhon[n]; trhon += nmax; 
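The same guarded-pragma idiom recurs throughout these hunks. A minimal, self-contained sketch of the pattern (reduce_sum, data, and count are illustrative placeholders, not identifiers from these files):

static double reduce_sum(const double *data, int count) {
  double sum = 0.0;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
  #pragma omp simd reduction(+:sum)
#else
  #pragma simd reduction(+:sum)
#endif
  #pragma vector aligned
#endif
  // vectorized accumulation; only one of the two simd pragmas survives preprocessing
  for (int n = 0; n < count; n++) sum += data[n];
  return sum;
}

The OpenMP 4.0 form is preferred where available because the Intel-specific "#pragma simd" is deprecated in recent Intel compilers; "#pragma vector aligned" is placed after the branch since it applies to either path.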
@@ -414,8 +430,12 @@ void PairEAMIntel::eval(const int offload, const int vflag, if (EFLAG) tevdwl = (acc_t)0.0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:tevdwl) +#else #pragma simd reduction(+:tevdwl) +#endif + #pragma vector aligned #endif for (int ii = iifrom; ii < iito; ++ii) { const int i = ilist[ii]; @@ -510,9 +530,14 @@ void PairEAMIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + sv0, sv1, sv2, sv3, sv4, sv5) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < ej; jj++) { int jtype; diff --git a/src/INTEL/pair_gayberne_intel.cpp b/src/INTEL/pair_gayberne_intel.cpp index d7becc7585..c3abf68c12 100644 --- a/src/INTEL/pair_gayberne_intel.cpp +++ b/src/INTEL/pair_gayberne_intel.cpp @@ -449,9 +449,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0); #endif #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \ + t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) +#else + #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \ + t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \ - sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) #endif for (int jj = 0; jj < packed_j; jj++) { flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8; @@ -806,8 +811,12 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, acc_t *f_scalar2 = f_scalar + fst4; for (int t = 1; t < nthreads; t++) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int n = iifrom * 8; n < sto; n++) f_scalar[n] += f_scalar2[n]; diff --git a/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp index ad8ef4d84f..ef26f8f2d5 100644 --- a/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp +++ b/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp @@ -294,9 +294,14 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#else + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcelj, evdwl; diff --git a/src/INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/INTEL/pair_lj_charmm_coul_long_intel.cpp index a910c74acb..6f6bb3618e 100644 --- a/src/INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -314,9 +314,14 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#else + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, 
sv1, sv2, sv3, sv4, sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcelj, evdwl, ecoul; diff --git a/src/INTEL/pair_lj_cut_coul_long_intel.cpp b/src/INTEL/pair_lj_cut_coul_long_intel.cpp index 51e208314b..0d94fdb4c3 100644 --- a/src/INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/INTEL/pair_lj_cut_coul_long_intel.cpp @@ -305,9 +305,14 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#else + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcelj, evdwl, ecoul; diff --git a/src/INTEL/pair_lj_cut_intel.cpp b/src/INTEL/pair_lj_cut_intel.cpp index 84bc664e18..cf84cb3ca5 100644 --- a/src/INTEL/pair_lj_cut_intel.cpp +++ b/src/INTEL/pair_lj_cut_intel.cpp @@ -241,9 +241,15 @@ void PairLJCutIntel::eval(const int offload, const int vflag, if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) \ + aligned(jlist,x,ljc12oi,special_lj,f,lj34i:64) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) + #pragma vector aligned +#endif #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcelj, evdwl; diff --git a/src/INTEL/pair_sw_intel.cpp b/src/INTEL/pair_sw_intel.cpp index 17dffa2843..57a6b29945 100644 --- a/src/INTEL/pair_sw_intel.cpp +++ b/src/INTEL/pair_sw_intel.cpp @@ -371,8 +371,12 @@ void PairSWIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < ejnum_pad; jj++) { acc_t fjxtmp, fjytmp, fjztmp, fjtmp; diff --git a/src/INTEL/pppm_disp_intel.cpp b/src/INTEL/pppm_disp_intel.cpp index 8d4ed1778d..6b732ccfac 100644 --- a/src/INTEL/pppm_disp_intel.cpp +++ b/src/INTEL/pppm_disp_intel.cpp @@ -770,8 +770,12 @@ void PPPMDispIntel::particle_map(double delx, double dely, double delz, IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T)); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:flag) +#else #pragma simd reduction(+:flag) +#endif + #pragma vector aligned #endif for (int i = iifrom; i < iito; i++) { @@ -876,7 +880,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; @@ -885,7 +893,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd 
+#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3; @@ -917,8 +929,12 @@ void PPPMDispIntel::make_rho_c(IntelBuffers * /*buffers*/) int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mzyx = l + mzy; @@ -939,7 +955,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers * /*buffers*/) IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr); #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { for (int j = 1; j < nthr; j++) { @@ -1025,7 +1045,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -1034,7 +1058,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3; @@ -1067,8 +1095,12 @@ void PPPMDispIntel::make_rho_g(IntelBuffers * /*buffers*/) int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mzyx = l + mzy; @@ -1089,7 +1121,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers * /*buffers*/) IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr); #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { for (int j = 1; j < nthr; j++) { @@ -1173,7 +1209,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -1182,7 +1222,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3; @@ -1215,8 +1259,12 @@ void PPPMDispIntel::make_rho_a(IntelBuffers * /*buffers*/) int my = m + nysum; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -1307,7 +1355,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { 
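+            // copy the precomputed per-dimension charge-assignment (stencil)
+            // weights for this grid offset from the dispersion rho6 lookup table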
rho[0][k] = rho6_lookup[idx][k]; @@ -1316,7 +1368,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3; @@ -1349,8 +1405,12 @@ void PPPMDispIntel::make_rho_none(IntelBuffers * /*buffers*/) int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mzyx = l + mzy; @@ -1373,7 +1433,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers * /*buffers*/) IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr); #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { for (int j = 1; j < nthr; j++) { @@ -1454,7 +1518,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho_lookup[idx][k]; @@ -1463,7 +1531,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1 = rho_coeff[order-1][k]; @@ -1498,8 +1570,12 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers * /*buffers*/) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l+nxsum; @@ -1624,7 +1700,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers * /*buffers*/) int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; @@ -1636,7 +1716,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -1680,8 +1764,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers * /*buffers*/) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -1702,7 +1790,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers * /*buffers*/) } } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { particle_ekx[i] *= hx_inv; @@ -1802,7 +1894,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers * 
/*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho6_lookup[idx][k]; @@ -1811,7 +1907,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; @@ -1846,8 +1946,12 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers * /*buffers*/) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l+nxsum; @@ -1967,7 +2071,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers * /*buffers*/) int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -1979,7 +2087,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -2023,8 +2135,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers * /*buffers*/) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -2045,7 +2161,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers * /*buffers*/) } } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { particle_ekx[i] *= hx_inv; @@ -2143,7 +2263,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho6_lookup[idx][k]; @@ -2152,7 +2276,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; @@ -2206,8 +2334,12 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers * /*buffers*/) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l+nxsum; @@ -2398,7 +2530,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers * /*buffers*/) int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if 
defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -2410,7 +2546,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -2479,8 +2619,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers * /*buffers*/) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -2541,7 +2685,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers * /*buffers*/) } } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { particle_ekx0[i] *= hx_inv; @@ -2671,7 +2819,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho6_lookup[idx][k]; @@ -2680,7 +2832,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; @@ -2721,8 +2877,12 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers * /*buffers*/) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l+nxsum; @@ -2848,7 +3008,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers * /*buffers*/) int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -2860,7 +3024,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -2909,8 +3077,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers * /*buffers*/) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -2992,7 +3164,11 @@ void PPPMDispIntel::precompute_rho() for (int i = 0; i < rho_points; i++) { FFT_SCALAR dx = -1. 
+ 1./half_rho_scale * (FFT_SCALAR)i; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower; k<=nupper;k++) { FFT_SCALAR r1 = ZEROF; @@ -3006,7 +3182,11 @@ void PPPMDispIntel::precompute_rho() } if (differentiation_flag == 1) { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower; k<=nupper;k++) { FFT_SCALAR r1 = ZEROF; @@ -3026,7 +3206,11 @@ void PPPMDispIntel::precompute_rho() for (int i = 0; i < rho_points; i++) { FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower_6; k<=nupper_6;k++) { FFT_SCALAR r1 = ZEROF; @@ -3040,7 +3224,11 @@ void PPPMDispIntel::precompute_rho() } if (differentiation_flag == 1) { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower_6; k<=nupper_6;k++) { FFT_SCALAR r1 = ZEROF; diff --git a/src/INTEL/pppm_intel.cpp b/src/INTEL/pppm_intel.cpp index 8b0542d770..8041709ebc 100644 --- a/src/INTEL/pppm_intel.cpp +++ b/src/INTEL/pppm_intel.cpp @@ -394,8 +394,12 @@ void PPPMIntel::particle_map(IntelBuffers *buffers) IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T)); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:flag) +#else #pragma simd reduction(+:flag) +#endif + #pragma vector aligned #endif for (int i = iifrom; i < iito; i++) { @@ -500,7 +504,11 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; @@ -509,7 +517,11 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3; @@ -541,7 +553,11 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { int mzyx = l + mzy; @@ -563,7 +579,11 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr); #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { for (int j = 1; j < nthr; j++) { @@ -645,7 +665,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho_lookup[idx][k]; @@ -654,7 +678,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1 = rho_coeff[order-1][k]; @@ -690,7 +718,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if 
defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { int mx = l+nxsum; @@ -813,7 +845,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; @@ -825,7 +861,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -871,7 +911,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { int mx = l + nxsum; @@ -893,7 +937,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { particle_ekx[i] *= hx_inv; @@ -942,7 +990,11 @@ void PPPMIntel::precompute_rho() for (int i = 0; i < rho_points; i++) { FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower; k<=nupper;k++) { FFT_SCALAR r1 = ZEROF; @@ -956,7 +1008,11 @@ void PPPMIntel::precompute_rho() } if (differentiation_flag == 1) { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower; k<=nupper;k++) { FFT_SCALAR r1 = ZEROF; From af359df04255b491aafeefd8fc6a49e12f86ad43 Mon Sep 17 00:00:00 2001 From: Mike Brown Date: Mon, 26 Jul 2021 12:04:31 -0700 Subject: [PATCH 11/17] Allowing nofdotr for Intel package with newton off. 
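With newton off, the vectorized kernels already accumulate the six per-pair virial terms (sv0..sv5) when vflag == VIRIAL_PAIR, so the fdotr shortcut is not mandatory in that case and the error check can be narrowed to newton on. Each affected compute() now guards as sketched here (the message text matches the hunks below):

  if (vflag && !vflag_fdotr && force->newton_pair)
    error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
               "with newton on");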
--- src/INTEL/pair_airebo_intel.cpp | 5 +++-- src/INTEL/pair_buck_coul_cut_intel.cpp | 5 +++-- src/INTEL/pair_buck_coul_long_intel.cpp | 5 +++-- src/INTEL/pair_buck_intel.cpp | 5 +++-- src/INTEL/pair_dpd_intel.cpp | 5 +++-- src/INTEL/pair_eam_intel.cpp | 5 +++-- src/INTEL/pair_gayberne_intel.cpp | 5 +++-- src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp | 5 +++-- src/INTEL/pair_lj_charmm_coul_long_intel.cpp | 5 +++-- src/INTEL/pair_lj_cut_coul_long_intel.cpp | 5 +++-- src/INTEL/pair_lj_cut_intel.cpp | 5 +++-- src/INTEL/pair_sw_intel.cpp | 5 +++-- src/INTEL/pair_tersoff_intel.cpp | 5 +++-- 13 files changed, 39 insertions(+), 26 deletions(-) diff --git a/src/INTEL/pair_airebo_intel.cpp b/src/INTEL/pair_airebo_intel.cpp index 5ea0d3168b..be38bbe418 100644 --- a/src/INTEL/pair_airebo_intel.cpp +++ b/src/INTEL/pair_airebo_intel.cpp @@ -292,8 +292,9 @@ void PairAIREBOIntel::compute( ev_init(eflag,vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); pvector[0] = pvector[1] = pvector[2] = 0.0; diff --git a/src/INTEL/pair_buck_coul_cut_intel.cpp b/src/INTEL/pair_buck_coul_cut_intel.cpp index c67450fbc1..9181b15aae 100644 --- a/src/INTEL/pair_buck_coul_cut_intel.cpp +++ b/src/INTEL/pair_buck_coul_cut_intel.cpp @@ -77,8 +77,9 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag, ev_init(eflag,vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); const int inum = list->inum; const int nthreads = comm->nthreads; diff --git a/src/INTEL/pair_buck_coul_long_intel.cpp b/src/INTEL/pair_buck_coul_long_intel.cpp index 7c795d5914..fdf78ff5d9 100644 --- a/src/INTEL/pair_buck_coul_long_intel.cpp +++ b/src/INTEL/pair_buck_coul_long_intel.cpp @@ -77,8 +77,9 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag, ev_init(eflag,vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); const int inum = list->inum; const int nthreads = comm->nthreads; diff --git a/src/INTEL/pair_buck_intel.cpp b/src/INTEL/pair_buck_intel.cpp index ddab17765b..2dae75f920 100644 --- a/src/INTEL/pair_buck_intel.cpp +++ b/src/INTEL/pair_buck_intel.cpp @@ -70,8 +70,9 @@ void PairBuckIntel::compute(int eflag, int vflag, ev_init(eflag,vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); const int inum = list->inum; const int nthreads = comm->nthreads; diff --git a/src/INTEL/pair_dpd_intel.cpp b/src/INTEL/pair_dpd_intel.cpp index a9eb4fe6a4..b69473fbbf 100644 --- a/src/INTEL/pair_dpd_intel.cpp +++ 
b/src/INTEL/pair_dpd_intel.cpp @@ -89,8 +89,9 @@ void PairDPDIntel::compute(int eflag, int vflag, ev_init(eflag, vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); const int inum = list->inum; const int nthreads = comm->nthreads; diff --git a/src/INTEL/pair_eam_intel.cpp b/src/INTEL/pair_eam_intel.cpp index 13dbd60cb3..911a623d6d 100644 --- a/src/INTEL/pair_eam_intel.cpp +++ b/src/INTEL/pair_eam_intel.cpp @@ -82,8 +82,9 @@ void PairEAMIntel::compute(int eflag, int vflag, ev_init(eflag, vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); const int inum = list->inum; const int nthreads = comm->nthreads; diff --git a/src/INTEL/pair_gayberne_intel.cpp b/src/INTEL/pair_gayberne_intel.cpp index c3abf68c12..285dd9661b 100644 --- a/src/INTEL/pair_gayberne_intel.cpp +++ b/src/INTEL/pair_gayberne_intel.cpp @@ -76,8 +76,9 @@ void PairGayBerneIntel::compute(int eflag, int vflag, ev_init(eflag, vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); const int inum = list->inum; const int nall = atom->nlocal + atom->nghost; diff --git a/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp index ef26f8f2d5..3a7ab5276e 100644 --- a/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp +++ b/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp @@ -73,8 +73,9 @@ void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag, ev_init(eflag,vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); const int inum = list->inum; const int nthreads = comm->nthreads; diff --git a/src/INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/INTEL/pair_lj_charmm_coul_long_intel.cpp index 6f6bb3618e..8a4595dd99 100644 --- a/src/INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -77,8 +77,9 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag, ev_init(eflag,vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); const int inum = list->inum; const int nthreads = comm->nthreads; diff --git a/src/INTEL/pair_lj_cut_coul_long_intel.cpp b/src/INTEL/pair_lj_cut_coul_long_intel.cpp index 0d94fdb4c3..a53d74f72a 100644 --- 
a/src/INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/INTEL/pair_lj_cut_coul_long_intel.cpp @@ -76,8 +76,9 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag, ev_init(eflag,vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); const int inum = list->inum; const int nthreads = comm->nthreads; diff --git a/src/INTEL/pair_lj_cut_intel.cpp b/src/INTEL/pair_lj_cut_intel.cpp index cf84cb3ca5..133b6079a1 100644 --- a/src/INTEL/pair_lj_cut_intel.cpp +++ b/src/INTEL/pair_lj_cut_intel.cpp @@ -68,8 +68,9 @@ void PairLJCutIntel::compute(int eflag, int vflag, ev_init(eflag, vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); const int inum = list->inum; const int nthreads = comm->nthreads; diff --git a/src/INTEL/pair_sw_intel.cpp b/src/INTEL/pair_sw_intel.cpp index 57a6b29945..f494965ff8 100644 --- a/src/INTEL/pair_sw_intel.cpp +++ b/src/INTEL/pair_sw_intel.cpp @@ -97,8 +97,9 @@ void PairSWIntel::compute(int eflag, int vflag, ev_init(eflag, vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); const int inum = list->inum; const int nthreads = comm->nthreads; diff --git a/src/INTEL/pair_tersoff_intel.cpp b/src/INTEL/pair_tersoff_intel.cpp index d4b8f7d499..732d100927 100644 --- a/src/INTEL/pair_tersoff_intel.cpp +++ b/src/INTEL/pair_tersoff_intel.cpp @@ -91,8 +91,9 @@ void PairTersoffIntel::compute(int eflag, int vflag, ev_init(eflag,vflag); if (vflag_atom) error->all(FLERR,"INTEL package does not support per-atom stress"); - if (vflag && !vflag_fdotr) - error->all(FLERR,"INTEL package does not support pair_modify nofdotr"); + if (vflag && !vflag_fdotr && force->newton_pair) + error->all(FLERR,"INTEL package does not support pair_modify nofdotr " + "with newton on"); const int inum = list->inum; const int nthreads = comm->nthreads; From 829bc8a617910d2226f67d8ee27c57daced0eb1f Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 26 Jul 2021 18:45:37 -0400 Subject: [PATCH 12/17] update equation for updated NEB code --- doc/src/fix_neb.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/fix_neb.rst b/doc/src/fix_neb.rst index 848369d3ea..5e57da328a 100644 --- a/doc/src/fix_neb.rst +++ b/doc/src/fix_neb.rst @@ -89,7 +89,7 @@ first stage) is changed to: .. parsed-literal:: - Fi = -Grad(V) + 2 (Grad(V) dot T') T' + Fi = -Grad(V) + 2 (Grad(V) dot T') T' + Fnudge_perp and the relaxation procedure is continued to a new converged MEP. 
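Rendered in LaTeX, the modified first-stage force on the target-climbing replica reads (with T' the tangent to the path, as in the surrounding text, and F^{nudge}_{\perp} the perpendicular nudging force retained by the updated code):

F_i = -\nabla V + 2\,(\nabla V \cdot T')\, T' + F^{\mathrm{nudge}}_{\perp}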
From 16fae72670b9eebaf3dbec4805d7dd8094f1ca61 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer
Date: Tue, 27 Jul 2021 08:44:59 -0400
Subject: [PATCH 13/17] small tweak for MinGW-64 compilation on Fedora 34

---
 cmake/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 7269fc5c9b..b76163aaef 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -363,6 +363,8 @@ if(PKG_MSCG OR PKG_ATC OR PKG_AWPMD OR PKG_ML-QUIP OR PKG_LATTE)
   endif()
 endif()

+# tweak jpeg library names to avoid linker errors with MinGW cross-compilation
+set(JPEG_NAMES libjpeg libjpeg-62)
 find_package(JPEG QUIET)
 option(WITH_JPEG "Enable JPEG support" ${JPEG_FOUND})
 if(WITH_JPEG)

From f39c5178d81276070bc4b29c7c487e21494e7e5f Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer
Date: Tue, 27 Jul 2021 09:56:52 -0400
Subject: [PATCH 14/17] update plumed to version 2.7.2 and drop workaround for
 2.7.1

---
 cmake/Modules/Packages/PLUMED.cmake      | 5 ++---
 lib/plumed/Install.py                    | 1 +
 tools/offline/scripts/init_http_cache.sh | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmake/Modules/Packages/PLUMED.cmake b/cmake/Modules/Packages/PLUMED.cmake
index e9eba779f5..0f063f3e14 100644
--- a/cmake/Modules/Packages/PLUMED.cmake
+++ b/cmake/Modules/Packages/PLUMED.cmake
@@ -54,8 +54,8 @@ if(DOWNLOAD_PLUMED)
     set(PLUMED_BUILD_BYPRODUCTS "/lib/libplumedWrapper.a")
   endif()

-  set(PLUMED_URL "https://github.com/plumed/plumed2/releases/download/v2.7.1/plumed-src-2.7.1.tgz" CACHE STRING "URL for PLUMED tarball")
-  set(PLUMED_MD5 "4eac6a462ec84dfe0cec96c82421b8e8" CACHE STRING "MD5 checksum of PLUMED tarball")
+  set(PLUMED_URL "https://github.com/plumed/plumed2/releases/download/v2.7.2/plumed-src-2.7.2.tgz" CACHE STRING "URL for PLUMED tarball")
+  set(PLUMED_MD5 "cfa0b4dd90a81c25d3302e8d97bfeaea" CACHE STRING "MD5 checksum of PLUMED tarball")
   mark_as_advanced(PLUMED_URL)
   mark_as_advanced(PLUMED_MD5)

@@ -72,7 +72,6 @@ if(DOWNLOAD_PLUMED)
                     ${PLUMED_CONFIG_OMP}
                     CXX=${PLUMED_CONFIG_CXX}
                     CC=${PLUMED_CONFIG_CC}
-    PATCH_COMMAND sed -i "/^#include /a #include " /src/lepton/Operation.h
     BUILD_BYPRODUCTS ${PLUMED_BUILD_BYPRODUCTS}
   )
   ExternalProject_get_property(plumed_build INSTALL_DIR)
diff --git a/lib/plumed/Install.py b/lib/plumed/Install.py
index e3858b39d3..548e51a5bc 100644
--- a/lib/plumed/Install.py
+++ b/lib/plumed/Install.py
@@ -53,6 +53,7 @@ checksums = { \
   '2.6.3' : 'a9f8028fd74528c2024781ea1fdefeee', \
   '2.7.0' : '95f29dd0c067577f11972ff90dfc7d12', \
   '2.7.1' : '4eac6a462ec84dfe0cec96c82421b8e8', \
+  '2.7.2' : 'cfa0b4dd90a81c25d3302e8d97bfeaea', \
 }

 # parse and process arguments
diff --git a/tools/offline/scripts/init_http_cache.sh b/tools/offline/scripts/init_http_cache.sh
index 55856bbf21..44a07da35a 100755
--- a/tools/offline/scripts/init_http_cache.sh
+++ b/tools/offline/scripts/init_http_cache.sh
@@ -50,7 +50,7 @@ CUB_URL="https://github.com/NVlabs/cub/archive/1.12.0.tar.gz"
 KOKKOS_URL="https://github.com/kokkos/kokkos/archive/3.4.01.tar.gz"
 KIM_URL="https://s3.openkim.org/kim-api/kim-api-2.2.1.txz"
 MSCG_URL="https://github.com/uchicago-voth/MSCG-release/archive/1.7.3.1.tar.gz"
-PLUMED_URL="https://github.com/plumed/plumed2/releases/download/v2.7.1/plumed-src-2.7.1.tgz"
+PLUMED_URL="https://github.com/plumed/plumed2/releases/download/v2.7.2/plumed-src-2.7.2.tgz"
 PACELIB_URL="https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2021.4.9.tar.gz"
 LATTE_URL="https://github.com/lanl/LATTE/archive/v1.2.2.tar.gz"
 SCAFACOS_URL="https://github.com/scafacos/scafacos/releases/download/v1.0.1/scafacos-1.0.1.tar.gz"

From ec069595f767b5285c55dda69834e819ad3c3824 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer
Date: Tue, 27 Jul 2021 11:16:40 -0400
Subject: [PATCH 15/17] move compute orientorder/atom because compute
 coord/atom depends on it

---
 doc/src/compute_orientorder_atom.rst                 | 3 +--
 src/{EXTRA-COMPUTE => }/compute_orientorder_atom.cpp | 0
 src/{EXTRA-COMPUTE => }/compute_orientorder_atom.h   | 0
 3 files changed, 1 insertion(+), 2 deletions(-)
 rename src/{EXTRA-COMPUTE => }/compute_orientorder_atom.cpp (100%)
 rename src/{EXTRA-COMPUTE => }/compute_orientorder_atom.h (100%)

diff --git a/doc/src/compute_orientorder_atom.rst b/doc/src/compute_orientorder_atom.rst
index 2efea311a0..bc608e21d7 100644
--- a/doc/src/compute_orientorder_atom.rst
+++ b/doc/src/compute_orientorder_atom.rst
@@ -182,8 +182,7 @@ page for an overview of LAMMPS output options.
 Restrictions
 """"""""""""

-This compute is part of the EXTRA-COMPUTE package. It is only enabled if
-LAMMPS was built with that package. See the :doc:`Build package ` page for more info.
+none

 Related commands
 """"""""""""""""
diff --git a/src/EXTRA-COMPUTE/compute_orientorder_atom.cpp b/src/compute_orientorder_atom.cpp
similarity index 100%
rename from src/EXTRA-COMPUTE/compute_orientorder_atom.cpp
rename to src/compute_orientorder_atom.cpp
diff --git a/src/EXTRA-COMPUTE/compute_orientorder_atom.h b/src/compute_orientorder_atom.h
similarity index 100%
rename from src/EXTRA-COMPUTE/compute_orientorder_atom.h
rename to src/compute_orientorder_atom.h

From f7f85822a932c63d95a9a2c0bb93a451ea5519ee Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer
Date: Tue, 27 Jul 2021 12:07:12 -0400
Subject: [PATCH 16/17] move some more computes from EXTRA-COMPUTE back to src
 because of dependencies

---
 doc/src/compute_cluster_atom.rst                   | 3 +--
 src/{EXTRA-COMPUTE => }/compute_aggregate_atom.cpp | 0
 src/{EXTRA-COMPUTE => }/compute_aggregate_atom.h   | 0
 src/{EXTRA-COMPUTE => }/compute_cluster_atom.cpp   | 0
 src/{EXTRA-COMPUTE => }/compute_cluster_atom.h     | 0
 src/{EXTRA-COMPUTE => }/compute_fragment_atom.cpp  | 0
 src/{EXTRA-COMPUTE => }/compute_fragment_atom.h    | 0
 7 files changed, 1 insertion(+), 2 deletions(-)
 rename src/{EXTRA-COMPUTE => }/compute_aggregate_atom.cpp (100%)
 rename src/{EXTRA-COMPUTE => }/compute_aggregate_atom.h (100%)
 rename src/{EXTRA-COMPUTE => }/compute_cluster_atom.cpp (100%)
 rename src/{EXTRA-COMPUTE => }/compute_cluster_atom.h (100%)
 rename src/{EXTRA-COMPUTE => }/compute_fragment_atom.cpp (100%)
 rename src/{EXTRA-COMPUTE => }/compute_fragment_atom.h (100%)

diff --git a/doc/src/compute_cluster_atom.rst b/doc/src/compute_cluster_atom.rst
index f82b2affa5..32954480cc 100644
--- a/doc/src/compute_cluster_atom.rst
+++ b/doc/src/compute_cluster_atom.rst
@@ -119,8 +119,7 @@ The per-atom vector values will be an ID > 0, as explained above.
 Restrictions
 """"""""""""

-These computes are part of the EXTRA-COMPUTE package. They are only enabled if
-LAMMPS was built with that package. See the :doc:`Build package ` page for more info.
+none

 Related commands
 """"""""""""""""
diff --git a/src/EXTRA-COMPUTE/compute_aggregate_atom.cpp b/src/compute_aggregate_atom.cpp
similarity index 100%
rename from src/EXTRA-COMPUTE/compute_aggregate_atom.cpp
rename to src/compute_aggregate_atom.cpp
diff --git a/src/EXTRA-COMPUTE/compute_aggregate_atom.h b/src/compute_aggregate_atom.h
similarity index 100%
rename from src/EXTRA-COMPUTE/compute_aggregate_atom.h
rename to src/compute_aggregate_atom.h
diff --git a/src/EXTRA-COMPUTE/compute_cluster_atom.cpp b/src/compute_cluster_atom.cpp
similarity index 100%
rename from src/EXTRA-COMPUTE/compute_cluster_atom.cpp
rename to src/compute_cluster_atom.cpp
diff --git a/src/EXTRA-COMPUTE/compute_cluster_atom.h b/src/compute_cluster_atom.h
similarity index 100%
rename from src/EXTRA-COMPUTE/compute_cluster_atom.h
rename to src/compute_cluster_atom.h
diff --git a/src/EXTRA-COMPUTE/compute_fragment_atom.cpp b/src/compute_fragment_atom.cpp
similarity index 100%
rename from src/EXTRA-COMPUTE/compute_fragment_atom.cpp
rename to src/compute_fragment_atom.cpp
diff --git a/src/EXTRA-COMPUTE/compute_fragment_atom.h b/src/compute_fragment_atom.h
similarity index 100%
rename from src/EXTRA-COMPUTE/compute_fragment_atom.h
rename to src/compute_fragment_atom.h

From d292da78ca705254536b05f2f799bbbf230398d5 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer
Date: Tue, 27 Jul 2021 12:29:44 -0400
Subject: [PATCH 17/17] address CodeQL warnings and reformat with clang-format

---
 src/EXTRA-COMPUTE/compute_hma.cpp            |  28 ++--
 src/TALLY/compute_force_tally.cpp            | 101 +++++++--------
 src/TALLY/compute_heat_flux_tally.cpp        | 101 +++++++--------
 src/TALLY/compute_heat_flux_virial_tally.cpp |   2 +-
 src/TALLY/compute_pe_mol_tally.cpp           |  62 +++++----
 src/TALLY/compute_pe_tally.cpp               |  85 ++++++-------
 src/TALLY/compute_stress_tally.cpp           | 127 ++++++++++---------
 7 files changed, 239 insertions(+), 267 deletions(-)

diff --git a/src/EXTRA-COMPUTE/compute_hma.cpp b/src/EXTRA-COMPUTE/compute_hma.cpp
index b74280a8e5..09a2840906 100644
--- a/src/EXTRA-COMPUTE/compute_hma.cpp
+++ b/src/EXTRA-COMPUTE/compute_hma.cpp
@@ -116,7 +116,7 @@ ComputeHMA::ComputeHMA(LAMMPS *lmp, int narg, char **arg) :
   computeU = computeP = computeCv = -1;
   returnAnharmonic = 0;
   size_vector = 0;
-  memory->create(extlist, 3, "hma:extlist");
+  extlist = new int[3];
   for (int iarg=4; iarg-1) continue;
@@ -145,20 +145,11 @@ ComputeHMA::ComputeHMA(LAMMPS *lmp, int narg, char **arg) :
     }
   }

-  if (size_vector == 0) {
-    error->all(FLERR,"Illegal compute hma command");
-  }
-  if (size_vector<3) {
-    memory->grow(extlist, size_vector, "hma:extlist");
-  }
-  memory->create(vector, size_vector, "hma:vector");
+  if (size_vector == 0) error->all(FLERR,"Illegal compute hma command");
+  vector = new double[size_vector];

-  if (computeU>-1 || computeCv>-1) {
-    peflag = 1;
-  }
-  if (computeP>-1) {
-    pressflag = 1;
-  }
+  if (computeU>-1 || computeCv>-1) peflag = 1;
+  if (computeP>-1) pressflag = 1;

   nmax = 0;
 }
@@ -170,10 +161,11 @@ ComputeHMA::~ComputeHMA()
   // check nfix in case all fixes have already been deleted
   if (modify->nfix) modify->delete_fix(id_fix);

-  delete [] id_fix;
-  delete [] id_temp;
-  memory->destroy(extlist);
-  memory->destroy(vector);
+  delete[] id_fix;
+  delete[] id_temp;
+  delete[] extlist;
+  delete[] vector;
+
   memory->destroy(deltaR);
 }
diff --git a/src/TALLY/compute_force_tally.cpp b/src/TALLY/compute_force_tally.cpp
index bc9a79713b..8216269fe0 100644
--- a/src/TALLY/compute_force_tally.cpp
+++ b/src/TALLY/compute_force_tally.cpp
@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -14,28 +13,26 @@

 #include "compute_force_tally.h"

-#include <cmath>
 #include "atom.h"
-#include "group.h"
-#include "pair.h"
-#include "update.h"
-#include "memory.h"
+#include "comm.h"
 #include "error.h"
 #include "force.h"
-#include "comm.h"
+#include "group.h"
+#include "memory.h"
+#include "pair.h"
+#include "update.h"
+
+#include <cmath>

 using namespace LAMMPS_NS;

 /* ---------------------------------------------------------------------- */

-ComputeForceTally::ComputeForceTally(LAMMPS *lmp, int narg, char **arg) :
-  Compute(lmp, narg, arg)
+ComputeForceTally::ComputeForceTally(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg)
 {
-  if (narg < 4) error->all(FLERR,"Illegal compute force/tally command");
+  if (narg < 4) error->all(FLERR, "Illegal compute force/tally command");

   igroup2 = group->find(arg[3]);
-  if (igroup2 == -1)
-    error->all(FLERR,"Could not find compute force/tally second group ID");
+  if (igroup2 == -1) error->all(FLERR, "Could not find compute force/tally second group ID");
   groupbit2 = group->bitmask[igroup2];

   scalar_flag = 1;
@@ -46,7 +43,7 @@ ComputeForceTally::ComputeForceTally(LAMMPS *lmp, int narg, char **arg) :
   comm_reverse = size_peratom_cols = 3;
   extscalar = 1;

-  peflag = 1;                   // we need Pair::ev_tally() to be run
+  peflag = 1;    // we need Pair::ev_tally() to be run

   did_setup = invoked_peratom = invoked_scalar = -1;
   nmax = -1;
@@ -68,17 +65,16 @@ ComputeForceTally::~ComputeForceTally()
 void ComputeForceTally::init()
 {
   if (force->pair == nullptr)
-    error->all(FLERR,"Trying to use compute force/tally without pair style");
+    error->all(FLERR, "Trying to use compute force/tally without pair style");
   else
     force->pair->add_tally_callback(this);

   if (comm->me == 0) {
     if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-      error->warning(FLERR,"Compute force/tally used with incompatible pair style");
+      error->warning(FLERR, "Compute force/tally used with incompatible pair style");

-    if (force->bond || force->angle || force->dihedral
-        || force->improper || force->kspace)
-      error->warning(FLERR,"Compute force/tally only called from pair style");
+    if (force->bond || force->angle || force->dihedral || force->improper || force->kspace)
+      error->warning(FLERR, "Compute force/tally only called from pair style");
   }
   did_setup = -1;
 }
@@ -99,51 +95,48 @@ void ComputeForceTally::pair_setup_callback(int, int)
   if (atom->nmax > nmax) {
     memory->destroy(fatom);
     nmax = atom->nmax;
-    memory->create(fatom,nmax,size_peratom_cols,"force/tally:fatom");
+    memory->create(fatom, nmax, size_peratom_cols, "force/tally:fatom");
     array_atom = fatom;
   }

   // clear storage

-  for (int i=0; i < ntotal; ++i)
-    for (int j=0; j < size_peratom_cols; ++j)
-      fatom[i][j] = 0.0;
+  for (int i = 0; i < ntotal; ++i)
+    for (int j = 0; j < size_peratom_cols; ++j) fatom[i][j] = 0.0;

-  for (int i=0; i < size_peratom_cols; ++i)
-    vector[i] = ftotal[i] = 0.0;
+  for (int i = 0; i < size_peratom_cols; ++i) vector[i] = ftotal[i] = 0.0;

   did_setup = update->ntimestep;
 }

 /* ---------------------------------------------------------------------- */
-void ComputeForceTally::pair_tally_callback(int i, int j, int nlocal, int newton,
-                                            double, double, double fpair,
-                                            double dx, double dy, double dz)
+void ComputeForceTally::pair_tally_callback(int i, int j, int nlocal, int newton, double, double,
+                                            double fpair, double dx, double dy, double dz)
 {
-  const int * const mask = atom->mask;
+  const int *const mask = atom->mask;

-  if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
-       || ((mask[i] & groupbit2) && (mask[j] & groupbit))) {
+  if (((mask[i] & groupbit) && (mask[j] & groupbit2)) ||
+      ((mask[i] & groupbit2) && (mask[j] & groupbit))) {

     if (newton || i < nlocal) {
       if (mask[i] & groupbit) {
-        ftotal[0] += fpair*dx;
-        ftotal[1] += fpair*dy;
-        ftotal[2] += fpair*dz;
+        ftotal[0] += fpair * dx;
+        ftotal[1] += fpair * dy;
+        ftotal[2] += fpair * dz;
       }
-      fatom[i][0] += fpair*dx;
-      fatom[i][1] += fpair*dy;
-      fatom[i][2] += fpair*dz;
+      fatom[i][0] += fpair * dx;
+      fatom[i][1] += fpair * dy;
+      fatom[i][2] += fpair * dz;
     }
     if (newton || j < nlocal) {
       if (mask[j] & groupbit) {
-        ftotal[0] -= fpair*dx;
-        ftotal[1] -= fpair*dy;
-        ftotal[2] -= fpair*dz;
+        ftotal[0] -= fpair * dx;
+        ftotal[1] -= fpair * dy;
+        ftotal[2] -= fpair * dz;
       }
-      fatom[j][0] -= fpair*dx;
-      fatom[j][1] -= fpair*dy;
-      fatom[j][2] -= fpair*dz;
+      fatom[j][0] -= fpair * dx;
+      fatom[j][1] -= fpair * dy;
+      fatom[j][2] -= fpair * dz;
     }
   }
 }
@@ -152,7 +145,7 @@

 int ComputeForceTally::pack_reverse_comm(int n, int first, double *buf)
 {
-  int i,m,last;
+  int i, m, last;

   m = 0;
   last = first + n;
@@ -168,7 +161,7 @@

 void ComputeForceTally::unpack_reverse_comm(int n, int *list, double *buf)
 {
-  int i,j,m;
+  int i, j, m;

   m = 0;
   for (i = 0; i < n; i++) {
@@ -184,15 +177,14 @@

 double ComputeForceTally::compute_scalar()
 {
   invoked_scalar = update->ntimestep;
-  if ((did_setup != invoked_scalar)
-      || (update->eflag_global != invoked_scalar))
-    error->all(FLERR,"Energy was not tallied on needed timestep");
+  if ((did_setup != invoked_scalar) || (update->eflag_global != invoked_scalar))
+    error->all(FLERR, "Energy was not tallied on needed timestep");

   // sum accumulated forces across procs

-  MPI_Allreduce(ftotal,vector,size_peratom_cols,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(ftotal, vector, size_peratom_cols, MPI_DOUBLE, MPI_SUM, world);

-  scalar = sqrt(vector[0]*vector[0]+vector[1]*vector[1]+vector[2]*vector[2]);
+  scalar = sqrt(vector[0] * vector[0] + vector[1] * vector[1] + vector[2] * vector[2]);
   return scalar;
 }
@@ -201,9 +193,8 @@ double ComputeForceTally::compute_scalar()
 void ComputeForceTally::compute_peratom()
 {
   invoked_peratom = update->ntimestep;
-  if ((did_setup != invoked_peratom)
-      || (update->eflag_global != invoked_peratom))
-    error->all(FLERR,"Energy was not tallied on needed timestep");
+  if ((did_setup != invoked_peratom) || (update->eflag_global != invoked_peratom))
+    error->all(FLERR, "Energy was not tallied on needed timestep");

   // collect contributions from ghost atoms
@@ -213,8 +204,7 @@ void ComputeForceTally::compute_peratom()
     // clear out ghost atom data after it has been collected to local atoms

     const int nall = atom->nlocal + atom->nghost;
     for (int i = atom->nlocal; i < nall; ++i)
-      for (int j = 0; j < size_peratom_cols; ++j)
-        fatom[i][j] = 0.0;
+      for (int j = 0; j < size_peratom_cols; ++j) fatom[i][j] = 0.0;
   }
 }
@@ -224,7 +214,6 @@

 double ComputeForceTally::memory_usage()
 {
-  double bytes = (nmax < 0) ? 0 : nmax*size_peratom_cols * sizeof(double);
+  double bytes = (nmax < 0) ? 0 : nmax * (double)size_peratom_cols * sizeof(double);
   return bytes;
 }
-
diff --git a/src/TALLY/compute_heat_flux_tally.cpp b/src/TALLY/compute_heat_flux_tally.cpp
index 96456bf775..7bff99eeb7 100644
--- a/src/TALLY/compute_heat_flux_tally.cpp
+++ b/src/TALLY/compute_heat_flux_tally.cpp
@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -15,26 +14,25 @@
 #include "compute_heat_flux_tally.h"

 #include "atom.h"
-#include "group.h"
-#include "pair.h"
-#include "update.h"
-#include "memory.h"
+#include "comm.h"
 #include "error.h"
 #include "force.h"
-#include "comm.h"
+#include "group.h"
+#include "memory.h"
+#include "pair.h"
+#include "update.h"

 using namespace LAMMPS_NS;

 /* ---------------------------------------------------------------------- */

 ComputeHeatFluxTally::ComputeHeatFluxTally(LAMMPS *lmp, int narg, char **arg) :
-  Compute(lmp, narg, arg)
+    Compute(lmp, narg, arg)
 {
-  if (narg < 4) error->all(FLERR,"Illegal compute heat/flux/tally command");
+  if (narg < 4) error->all(FLERR, "Illegal compute heat/flux/tally command");

   igroup2 = group->find(arg[3]);
-  if (igroup2 == -1)
-    error->all(FLERR,"Could not find compute heat/flux/tally second group ID");
+  if (igroup2 == -1) error->all(FLERR, "Could not find compute heat/flux/tally second group ID");
   groupbit2 = group->bitmask[igroup2];

   vector_flag = 1;
@@ -44,7 +42,7 @@ ComputeHeatFluxTally::ComputeHeatFluxTally(LAMMPS *lmp, int narg, char **arg) :
   comm_reverse = 7;
   extvector = 1;
   size_vector = 6;
-  peflag = 1;                   // we need Pair::ev_tally() to be run
+  peflag = 1;    // we need Pair::ev_tally() to be run

   did_setup = 0;
   invoked_peratom = invoked_scalar = -1;
@@ -71,17 +69,16 @@ ComputeHeatFluxTally::~ComputeHeatFluxTally()
 void ComputeHeatFluxTally::init()
 {
   if (force->pair == nullptr)
-    error->all(FLERR,"Trying to use compute heat/flux/tally without pair style");
+    error->all(FLERR, "Trying to use compute heat/flux/tally without pair style");
   else
     force->pair->add_tally_callback(this);

   if (comm->me == 0) {
     if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-      error->warning(FLERR,"Compute heat/flux/tally used with incompatible pair style");
+      error->warning(FLERR, "Compute heat/flux/tally used with incompatible pair style");

-    if (force->bond || force->angle || force->dihedral
-        || force->improper || force->kspace)
-      error->warning(FLERR,"Compute heat/flux/tally only called from pair style");
+    if (force->bond || force->angle || force->dihedral || force->improper || force->kspace)
+      error->warning(FLERR, "Compute heat/flux/tally only called from pair style");
   }
   did_setup = -1;
 }
@@ -102,13 +99,13 @@ void ComputeHeatFluxTally::pair_setup_callback(int, int)
     memory->destroy(stress);
     memory->destroy(eatom);
     nmax = atom->nmax;
-    memory->create(stress,nmax,6,"heat/flux/tally:stress");
-    memory->create(eatom,nmax,"heat/flux/tally:eatom");
+    memory->create(stress, nmax, 6, "heat/flux/tally:stress");
+    memory->create(eatom, nmax, "heat/flux/tally:eatom");
   }

   // clear storage

-  for (int i=0; i < ntotal; ++i) {
+  for (int i = 0; i < ntotal; ++i) {
     eatom[i] = 0.0;
     stress[i][0] = 0.0;
     stress[i][1] = 0.0;
@@ -118,30 +115,29 @@ void ComputeHeatFluxTally::pair_setup_callback(int, int)
     stress[i][5] = 0.0;
   }

-  for (int i=0; i < size_vector; ++i)
-    vector[i] = heatj[i] = 0.0;
+  for (int i = 0; i < size_vector; ++i) vector[i] = heatj[i] = 0.0;

   did_setup = update->ntimestep;
 }

 /* ---------------------------------------------------------------------- */
-void ComputeHeatFluxTally::pair_tally_callback(int i, int j, int nlocal, int newton,
-                                               double evdwl, double ecoul, double fpair,
-                                               double dx, double dy, double dz)
+void ComputeHeatFluxTally::pair_tally_callback(int i, int j, int nlocal, int newton, double evdwl,
+                                               double ecoul, double fpair, double dx, double dy,
+                                               double dz)
 {
-  const int * const mask = atom->mask;
+  const int *const mask = atom->mask;

-  if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
-       || ((mask[i] & groupbit2) && (mask[j] & groupbit))) {
+  if (((mask[i] & groupbit) && (mask[j] & groupbit2)) ||
+      ((mask[i] & groupbit2) && (mask[j] & groupbit))) {

     const double epairhalf = 0.5 * (evdwl + ecoul);
     fpair *= 0.5;

-    const double v0 = dx*dx*fpair;    // dx*fpair = Fij_x
-    const double v1 = dy*dy*fpair;
-    const double v2 = dz*dz*fpair;
-    const double v3 = dx*dy*fpair;
-    const double v4 = dx*dz*fpair;
-    const double v5 = dy*dz*fpair;
+    const double v0 = dx * dx * fpair;    // dx*fpair = Fij_x
+    const double v1 = dy * dy * fpair;
+    const double v2 = dz * dz * fpair;
+    const double v3 = dx * dy * fpair;
+    const double v4 = dx * dz * fpair;
+    const double v5 = dy * dz * fpair;

     if (newton || i < nlocal) {
       eatom[i] += epairhalf;
@@ -168,7 +164,7 @@

 int ComputeHeatFluxTally::pack_reverse_comm(int n, int first, double *buf)
 {
-  int i,m,last;
+  int i, m, last;

   m = 0;
   last = first + n;
@@ -188,7 +184,7 @@

 void ComputeHeatFluxTally::unpack_reverse_comm(int n, int *list, double *buf)
 {
-  int i,j,m;
+  int i, j, m;

   m = 0;
   for (i = 0; i < n; i++) {
@@ -209,7 +205,7 @@ void ComputeHeatFluxTally::compute_vector()
 {
   invoked_vector = update->ntimestep;
   if ((did_setup != invoked_vector) || (update->eflag_global != invoked_vector))
-    error->all(FLERR,"Energy was not tallied on needed timestep");
+    error->all(FLERR, "Energy was not tallied on needed timestep");

   // collect contributions from ghost atoms
@@ -244,26 +240,28 @@ void ComputeHeatFluxTally::compute_vector()
   double *rmass = atom->rmass;
   int *type = atom->type;

-  double jc[3] = {0.0,0.0,0.0};
-  double jv[3] = {0.0,0.0,0.0};
+  double jc[3] = {0.0, 0.0, 0.0};
+  double jv[3] = {0.0, 0.0, 0.0};

   for (int i = 0; i < nlocal; i++) {
     if (mask[i] & groupbit) {
-      const double * const vi = v[i];
-      const double * const si = stress[i];
+      const double *const vi = v[i];
+      const double *const si = stress[i];
       double ke_i;
-      if (rmass) ke_i = pfactor * rmass[i];
-      else ke_i = pfactor * mass[type[i]];
-      ke_i *= (vi[0]*vi[0] + vi[1]*vi[1] + vi[2]*vi[2]);
+      if (rmass)
+        ke_i = pfactor * rmass[i];
+      else
+        ke_i = pfactor * mass[type[i]];
+      ke_i *= (vi[0] * vi[0] + vi[1] * vi[1] + vi[2] * vi[2]);
       ke_i += eatom[i];
-      jc[0] += ke_i*vi[0];
-      jc[1] += ke_i*vi[1];
-      jc[2] += ke_i*vi[2];
-      jv[0] += si[0]*vi[0] + si[3]*vi[1] + si[4]*vi[2];
-      jv[1] += si[3]*vi[0] + si[1]*vi[1] + si[5]*vi[2];
-      jv[2] += si[4]*vi[0] + si[5]*vi[1] + si[2]*vi[2];
+      jc[0] += ke_i * vi[0];
+      jc[1] += ke_i * vi[1];
+      jc[2] += ke_i * vi[2];
+      jv[0] += si[0] * vi[0] + si[3] * vi[1] + si[4] * vi[2];
+      jv[1] += si[3] * vi[0] + si[1] * vi[1] + si[5] * vi[2];
+      jv[2] += si[4] * vi[0] + si[5] * vi[1] + si[2] * vi[2];
     }
   }
@@ -274,7 +272,7 @@ void ComputeHeatFluxTally::compute_vector()
   heatj[3] = jc[0];
   heatj[4] = jc[1];
   heatj[5] = jc[2];
-  MPI_Allreduce(heatj,vector,size_vector,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(heatj, vector, size_vector, MPI_DOUBLE, MPI_SUM, world);
 }

 /* ----------------------------------------------------------------------
@@ -283,7 +281,6 @@ void ComputeHeatFluxTally::compute_vector()

 double ComputeHeatFluxTally::memory_usage()
 {
-  double bytes = (nmax < 0) ? 0 : nmax*comm_reverse * sizeof(double);
+  double bytes = (nmax < 0) ? 0 : nmax * (double)comm_reverse * sizeof(double);
   return bytes;
 }
-
diff --git a/src/TALLY/compute_heat_flux_virial_tally.cpp b/src/TALLY/compute_heat_flux_virial_tally.cpp
index 1a594c1b36..8605b9c546 100644
--- a/src/TALLY/compute_heat_flux_virial_tally.cpp
+++ b/src/TALLY/compute_heat_flux_virial_tally.cpp
@@ -233,6 +233,6 @@ void ComputeHeatFluxVirialTally::compute_peratom()

 double ComputeHeatFluxVirialTally::memory_usage()
 {
-  double bytes = (nmax < 0) ? 0 : nmax * size_peratom_cols * sizeof(double);
+  double bytes = (nmax < 0) ? 0 : nmax * (double)size_peratom_cols * sizeof(double);
   return bytes;
 }
diff --git a/src/TALLY/compute_pe_mol_tally.cpp b/src/TALLY/compute_pe_mol_tally.cpp
index 329c9ff429..fc3efb272b 100644
--- a/src/TALLY/compute_pe_mol_tally.cpp
+++ b/src/TALLY/compute_pe_mol_tally.cpp
@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -15,25 +14,23 @@
 #include "compute_pe_mol_tally.h"

 #include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
 #include "group.h"
 #include "pair.h"
 #include "update.h"
-#include "error.h"
-#include "force.h"
-#include "comm.h"

 using namespace LAMMPS_NS;

 /* ---------------------------------------------------------------------- */

-ComputePEMolTally::ComputePEMolTally(LAMMPS *lmp, int narg, char **arg) :
-  Compute(lmp, narg, arg)
+ComputePEMolTally::ComputePEMolTally(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg)
 {
-  if (narg < 4) error->all(FLERR,"Illegal compute pe/mol/tally command");
+  if (narg < 4) error->all(FLERR, "Illegal compute pe/mol/tally command");

   igroup2 = group->find(arg[3]);
-  if (igroup2 == -1)
-    error->all(FLERR,"Could not find compute pe/mol/tally second group ID");
+  if (igroup2 == -1) error->all(FLERR, "Could not find compute pe/mol/tally second group ID");
   groupbit2 = group->bitmask[igroup2];

   vector_flag = 1;
@@ -42,7 +39,7 @@ ComputePEMolTally::ComputePEMolTally(LAMMPS *lmp, int narg, char **arg) :
   dynamic_group_allow = 0;
   extvector = 1;

-  peflag = 1;                   // we need Pair::ev_tally() to be run
+  peflag = 1;    // we need Pair::ev_tally() to be run

   did_setup = invoked_vector = -1;
   vector = new double[size_vector];
@@ -61,20 +58,18 @@
 void ComputePEMolTally::init()
 {
   if (force->pair == nullptr)
-    error->all(FLERR,"Trying to use compute pe/mol/tally without pair style");
+    error->all(FLERR, "Trying to use compute pe/mol/tally without pair style");
   else
     force->pair->add_tally_callback(this);

-  if (atom->molecule_flag == 0)
-    error->all(FLERR,"Compute pe/mol/tally requires molecule IDs");
+  if (atom->molecule_flag == 0) error->all(FLERR, "Compute pe/mol/tally requires molecule IDs");

   if (comm->me == 0) {
     if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-      error->warning(FLERR,"Compute pe/mol/tally used with incompatible pair style");
+      error->warning(FLERR, "Compute pe/mol/tally used with incompatible pair style");

-    if (force->bond || force->angle || force->dihedral
-        || force->improper || force->kspace)
-      error->warning(FLERR,"Compute pe/mol/tally only called from pair style");
+    if (force->bond || force->angle || force->dihedral || force->improper || force->kspace)
+      error->warning(FLERR, "Compute pe/mol/tally only called from pair style");
   }
   did_setup = -1;
 }
@@ -93,29 +88,33 @@ void ComputePEMolTally::pair_setup_callback(int, int)
 }

 /* ---------------------------------------------------------------------- */
-void ComputePEMolTally::pair_tally_callback(int i, int j, int nlocal, int newton,
-                                            double evdwl, double ecoul, double,
-                                            double, double, double)
+void ComputePEMolTally::pair_tally_callback(int i, int j, int nlocal, int newton, double evdwl,
+                                            double ecoul, double, double, double, double)
 {
-  const int * const mask = atom->mask;
-  const tagint * const molid = atom->molecule;
+  const int *const mask = atom->mask;
+  const tagint *const molid = atom->molecule;

-  if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
-       || ((mask[i] & groupbit2) && (mask[j] & groupbit))) {
+  if (((mask[i] & groupbit) && (mask[j] & groupbit2)) ||
+      ((mask[i] & groupbit2) && (mask[j] & groupbit))) {

-    evdwl *= 0.5; ecoul *= 0.5;
+    evdwl *= 0.5;
+    ecoul *= 0.5;
     if (newton || i < nlocal) {
       if (molid[i] == molid[j]) {
-        etotal[0] += evdwl; etotal[1] += ecoul;
+        etotal[0] += evdwl;
+        etotal[1] += ecoul;
       } else {
-        etotal[2] += evdwl; etotal[3] += ecoul;
+        etotal[2] += evdwl;
+        etotal[3] += ecoul;
       }
     }
     if (newton || j < nlocal) {
       if (molid[i] == molid[j]) {
-        etotal[0] += evdwl; etotal[1] += ecoul;
+        etotal[0] += evdwl;
+        etotal[1] += ecoul;
       } else {
-        etotal[2] += evdwl; etotal[3] += ecoul;
+        etotal[2] += evdwl;
+        etotal[3] += ecoul;
       }
     }
   }
@@ -127,10 +126,9 @@
 void ComputePEMolTally::compute_vector()
 {
   invoked_vector = update->ntimestep;
   if ((did_setup != invoked_vector) || (update->eflag_global != invoked_vector))
-    error->all(FLERR,"Energy was not tallied on needed timestep");
+    error->all(FLERR, "Energy was not tallied on needed timestep");

   // sum accumulated energies across procs

-  MPI_Allreduce(etotal,vector,size_vector,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(etotal, vector, size_vector, MPI_DOUBLE, MPI_SUM, world);
 }
-
diff --git a/src/TALLY/compute_pe_tally.cpp b/src/TALLY/compute_pe_tally.cpp
index dae233f830..07cb500e44 100644
--- a/src/TALLY/compute_pe_tally.cpp
+++ b/src/TALLY/compute_pe_tally.cpp
@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -15,26 +14,24 @@
 #include "compute_pe_tally.h"

 #include "atom.h"
-#include "group.h"
-#include "pair.h"
-#include "update.h"
-#include "memory.h"
+#include "comm.h"
 #include "error.h"
 #include "force.h"
-#include "comm.h"
+#include "group.h"
+#include "memory.h"
+#include "pair.h"
+#include "update.h"

 using namespace LAMMPS_NS;

 /* ---------------------------------------------------------------------- */

-ComputePETally::ComputePETally(LAMMPS *lmp, int narg, char **arg) :
-  Compute(lmp, narg, arg)
+ComputePETally::ComputePETally(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg)
 {
-  if (narg < 4) error->all(FLERR,"Illegal compute pe/tally command");
+  if (narg < 4) error->all(FLERR, "Illegal compute pe/tally command");

   igroup2 = group->find(arg[3]);
-  if (igroup2 == -1)
-    error->all(FLERR,"Could not find compute pe/tally second group ID");
+  if (igroup2 == -1) error->all(FLERR, "Could not find compute pe/tally second group ID");
   groupbit2 = group->bitmask[igroup2];

   scalar_flag = 1;
@@ -45,7 +42,7 @@ ComputePETally::ComputePETally(LAMMPS *lmp, int narg, char **arg) :
   comm_reverse = size_peratom_cols = 2;
   extscalar = 1;

-  peflag = 1;                   // we need Pair::ev_tally() to be run
+  peflag = 1;    // we need Pair::ev_tally() to be run

   did_setup = invoked_peratom = invoked_scalar = -1;
   nmax = -1;
@@ -67,17 +64,16 @@ ComputePETally::~ComputePETally()
 void ComputePETally::init()
 {
   if (force->pair == nullptr)
-    error->all(FLERR,"Trying to use compute pe/tally without a pair style");
+    error->all(FLERR, "Trying to use compute pe/tally without a pair style");
   else
     force->pair->add_tally_callback(this);

   if (comm->me == 0) {
     if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-      error->warning(FLERR,"Compute pe/tally used with incompatible pair style");
+      error->warning(FLERR, "Compute pe/tally used with incompatible pair style");

-    if (force->bond || force->angle || force->dihedral
-        || force->improper || force->kspace)
-      error->warning(FLERR,"Compute pe/tally only called from pair style");
+    if (force->bond || force->angle || force->dihedral || force->improper || force->kspace)
+      error->warning(FLERR, "Compute pe/tally only called from pair style");
   }
   did_setup = -1;
 }
@@ -98,14 +94,13 @@ void ComputePETally::pair_setup_callback(int, int)
   if (atom->nmax > nmax) {
     memory->destroy(eatom);
     nmax = atom->nmax;
-    memory->create(eatom,nmax,size_peratom_cols,"pe/tally:eatom");
+    memory->create(eatom, nmax, size_peratom_cols, "pe/tally:eatom");
     array_atom = eatom;
   }

   // clear storage

-  for (int i=0; i < ntotal; ++i)
-    eatom[i][0] = eatom[i][1] = 0.0;
+  for (int i = 0; i < ntotal; ++i) eatom[i][0] = eatom[i][1] = 0.0;

   vector[0] = etotal[0] = vector[1] = etotal[1] = 0.0;
@@ -113,23 +108,27 @@ void ComputePETally::pair_setup_callback(int, int)
 }

 /* ---------------------------------------------------------------------- */
-void ComputePETally::pair_tally_callback(int i, int j, int nlocal, int newton,
-                                         double evdwl, double ecoul, double,
-                                         double, double, double)
+void ComputePETally::pair_tally_callback(int i, int j, int nlocal, int newton, double evdwl,
+                                         double ecoul, double, double, double, double)
 {
-  const int * const mask = atom->mask;
+  const int *const mask = atom->mask;

-  if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
-       || ((mask[i] & groupbit2) && (mask[j] & groupbit))) {
+  if (((mask[i] & groupbit) && (mask[j] & groupbit2)) ||
+      ((mask[i] & groupbit2) && (mask[j] & groupbit))) {

-    evdwl *= 0.5; ecoul *= 0.5;
+    evdwl *= 0.5;
+    ecoul *= 0.5;
     if (newton || i < nlocal) {
-      etotal[0] += evdwl; eatom[i][0] += evdwl;
-      etotal[1] += ecoul; eatom[i][1] += ecoul;
+      etotal[0] += evdwl;
+      eatom[i][0] += evdwl;
+      etotal[1] += ecoul;
+      eatom[i][1] += ecoul;
     }
     if (newton || j < nlocal) {
-      etotal[0] += evdwl; eatom[j][0] += evdwl;
-      etotal[1] += ecoul; eatom[j][1] += ecoul;
+      etotal[0] += evdwl;
+      eatom[j][0] += evdwl;
+      etotal[1] += ecoul;
+      eatom[j][1] += ecoul;
     }
   }
 }
@@ -138,7 +137,7 @@

 int ComputePETally::pack_reverse_comm(int n, int first, double *buf)
 {
-  int i,m,last;
+  int i, m, last;

   m = 0;
   last = first + n;
@@ -153,7 +152,7 @@

 void ComputePETally::unpack_reverse_comm(int n, int *list, double *buf)
 {
-  int i,j,m;
+  int i, j, m;

   m = 0;
   for (i = 0; i < n; i++) {
@@ -168,15 +167,14 @@

 double ComputePETally::compute_scalar()
 {
   invoked_scalar = update->ntimestep;
-  if ((did_setup != invoked_scalar)
-      || (update->eflag_global != invoked_scalar))
-    error->all(FLERR,"Energy was not tallied on needed timestep");
+  if ((did_setup != invoked_scalar) || (update->eflag_global != invoked_scalar))
+    error->all(FLERR, "Energy was not tallied on needed timestep");

   // sum accumulated energies across procs

-  MPI_Allreduce(etotal,vector,size_peratom_cols,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(etotal, vector, size_peratom_cols, MPI_DOUBLE, MPI_SUM, world);

-  scalar = vector[0]+vector[1];
+  scalar = vector[0] + vector[1];
   return scalar;
 }
@@ -185,9 +183,8 @@
 void ComputePETally::compute_peratom()
 {
   invoked_peratom = update->ntimestep;
-  if ((did_setup != invoked_peratom)
-      || (update->eflag_global != invoked_peratom))
-    error->all(FLERR,"Energy was not tallied on needed timestep");
+  if ((did_setup != invoked_peratom) || (update->eflag_global != invoked_peratom))
+    error->all(FLERR, "Energy was not tallied on needed timestep");

   // collect contributions from ghost atoms
@@ -196,8 +193,7 @@ void ComputePETally::compute_peratom()
     // clear out ghost atom data after it has been collected to local atoms

     const int nall = atom->nlocal + atom->nghost;
-    for (int i = atom->nlocal; i < nall; ++i)
-      eatom[i][0] = eatom[i][1] = 0.0;
+    for (int i = atom->nlocal; i < nall; ++i) eatom[i][0] = eatom[i][1] = 0.0;
   }
 }
@@ -207,7 +203,6 @@

 double ComputePETally::memory_usage()
 {
-  double bytes = (nmax < 0) ? 0 : nmax*size_peratom_cols * sizeof(double);
+  double bytes = (nmax < 0) ? 0 : nmax * (double)size_peratom_cols * sizeof(double);
   return bytes;
 }
-
diff --git a/src/TALLY/compute_stress_tally.cpp b/src/TALLY/compute_stress_tally.cpp
index 8ae42ddf02..dea65ade26 100644
--- a/src/TALLY/compute_stress_tally.cpp
+++ b/src/TALLY/compute_stress_tally.cpp
@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -15,27 +14,25 @@
 #include "compute_stress_tally.h"

 #include "atom.h"
-#include "group.h"
-#include "pair.h"
-#include "update.h"
-#include "memory.h"
-#include "error.h"
-#include "force.h"
 #include "comm.h"
 #include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "group.h"
+#include "memory.h"
+#include "pair.h"
+#include "update.h"

 using namespace LAMMPS_NS;

 /* ---------------------------------------------------------------------- */

-ComputeStressTally::ComputeStressTally(LAMMPS *lmp, int narg, char **arg) :
-  Compute(lmp, narg, arg)
+ComputeStressTally::ComputeStressTally(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg)
 {
-  if (narg < 4) error->all(FLERR,"Illegal compute stress/tally command");
+  if (narg < 4) error->all(FLERR, "Illegal compute stress/tally command");

   igroup2 = group->find(arg[3]);
-  if (igroup2 == -1)
-    error->all(FLERR,"Could not find compute stress/tally second group ID");
+  if (igroup2 == -1) error->all(FLERR, "Could not find compute stress/tally second group ID");
   groupbit2 = group->bitmask[igroup2];

   scalar_flag = 1;
@@ -46,7 +43,7 @@ ComputeStressTally::ComputeStressTally(LAMMPS *lmp, int narg, char **arg) :
   comm_reverse = size_peratom_cols = 6;
   extscalar = 0;

-  peflag = 1;                   // we need Pair::ev_tally() to be run
+  peflag = 1;    // we need Pair::ev_tally() to be run

   did_setup = invoked_peratom = invoked_scalar = -1;
   nmax = -1;
@@ -70,17 +67,16 @@ ComputeStressTally::~ComputeStressTally()
 void ComputeStressTally::init()
 {
   if (force->pair == nullptr)
-    error->all(FLERR,"Trying to use compute stress/tally without pair style");
+    error->all(FLERR, "Trying to use compute stress/tally without pair style");
   else
     force->pair->add_tally_callback(this);

   if (comm->me == 0) {
     if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-      error->warning(FLERR,"Compute stress/tally used with incompatible pair style");
+      error->warning(FLERR, "Compute stress/tally used with incompatible pair style");

-    if (force->bond || force->angle || force->dihedral
-        || force->improper || force->kspace)
-      error->warning(FLERR,"Compute stress/tally only called from pair style");
+    if (force->bond || force->angle || force->dihedral || force->improper || force->kspace)
+      error->warning(FLERR, "Compute stress/tally only called from pair style");
   }
   did_setup = -1;
 }
@@ -101,55 +97,64 @@ void ComputeStressTally::pair_setup_callback(int, int)
   if (atom->nmax > nmax) {
     memory->destroy(stress);
     nmax = atom->nmax;
-    memory->create(stress,nmax,size_peratom_cols,"stress/tally:stress");
+    memory->create(stress, nmax, size_peratom_cols, "stress/tally:stress");
     array_atom = stress;
   }

   // clear storage

-  for (int i=0; i < ntotal; ++i)
-    for (int j=0; j < size_peratom_cols; ++j)
-      stress[i][j] = 0.0;
+  for (int i = 0; i < ntotal; ++i)
+    for (int j = 0; j < size_peratom_cols; ++j) stress[i][j] = 0.0;

-  for (int i=0; i < size_peratom_cols; ++i)
-    vector[i] = virial[i] = 0.0;
+  for (int i = 0; i < size_peratom_cols; ++i) vector[i] = virial[i] = 0.0;

   did_setup = update->ntimestep;
 }

 /* ---------------------------------------------------------------------- */
-void ComputeStressTally::pair_tally_callback(int i, int j, int nlocal, int newton,
-                                             double, double, double fpair,
-                                             double dx, double dy, double dz)
+void ComputeStressTally::pair_tally_callback(int i, int j, int nlocal, int newton, double, double,
+                                             double fpair, double dx, double dy, double dz)
 {
-  const int * const mask = atom->mask;
+  const int *const mask = atom->mask;

-  if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
-       || ((mask[i] & groupbit2) && (mask[j] & groupbit))) {
+  if (((mask[i] & groupbit) && (mask[j] & groupbit2)) ||
+      ((mask[i] & groupbit2) && (mask[j] & groupbit))) {

     fpair *= 0.5;
-    const double v0 = dx*dx*fpair;
-    const double v1 = dy*dy*fpair;
-    const double v2 = dz*dz*fpair;
-    const double v3 = dx*dy*fpair;
-    const double v4 = dx*dz*fpair;
-    const double v5 = dy*dz*fpair;
+    const double v0 = dx * dx * fpair;
+    const double v1 = dy * dy * fpair;
+    const double v2 = dz * dz * fpair;
+    const double v3 = dx * dy * fpair;
+    const double v4 = dx * dz * fpair;
+    const double v5 = dy * dz * fpair;

     if (newton || i < nlocal) {
-      virial[0] += v0; stress[i][0] += v0;
-      virial[1] += v1; stress[i][1] += v1;
-      virial[2] += v2; stress[i][2] += v2;
-      virial[3] += v3; stress[i][3] += v3;
-      virial[4] += v4; stress[i][4] += v4;
-      virial[5] += v5; stress[i][5] += v5;
+      virial[0] += v0;
+      stress[i][0] += v0;
+      virial[1] += v1;
+      stress[i][1] += v1;
+      virial[2] += v2;
+      stress[i][2] += v2;
+      virial[3] += v3;
+      stress[i][3] += v3;
+      virial[4] += v4;
+      stress[i][4] += v4;
+      virial[5] += v5;
+      stress[i][5] += v5;
     }
     if (newton || j < nlocal) {
-      virial[0] += v0; stress[j][0] += v0;
-      virial[1] += v1; stress[j][1] += v1;
-      virial[2] += v2; stress[j][2] += v2;
-      virial[3] += v3; stress[j][3] += v3;
-      virial[4] += v4; stress[j][4] += v4;
-      virial[5] += v5; stress[j][5] += v5;
+      virial[0] += v0;
+      stress[j][0] += v0;
+      virial[1] += v1;
+      stress[j][1] += v1;
+      virial[2] += v2;
+      stress[j][2] += v2;
+      virial[3] += v3;
+      stress[j][3] += v3;
+      virial[4] += v4;
+      stress[j][4] += v4;
+      virial[5] += v5;
+      stress[j][5] += v5;
     }
   }
 }
@@ -158,7 +163,7 @@

 int ComputeStressTally::pack_reverse_comm(int n, int first, double *buf)
 {
-  int i,m,last;
+  int i, m, last;

   m = 0;
   last = first + n;
@@ -177,7 +182,7 @@

 void ComputeStressTally::unpack_reverse_comm(int n, int *list, double *buf)
 {
-  int i,j,m;
+  int i, j, m;

   m = 0;
   for (i = 0; i < n; i++) {
@@ -196,18 +201,17 @@ void ComputeStressTally::unpack_reverse_comm(int n, int *list, double *buf)
 double ComputeStressTally::compute_scalar()
 {
   invoked_scalar = update->ntimestep;
-  if ((did_setup != invoked_scalar)
-      || (update->eflag_global != invoked_scalar))
-    error->all(FLERR,"Energy was not tallied on needed timestep");
+  if ((did_setup != invoked_scalar) || (update->eflag_global != invoked_scalar))
+    error->all(FLERR, "Energy was not tallied on needed timestep");

   // sum accumulated forces across procs

-  MPI_Allreduce(virial,vector,size_peratom_cols,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(virial, vector, size_peratom_cols, MPI_DOUBLE, MPI_SUM, world);

   if (domain->dimension == 3)
-    scalar = (vector[0]+vector[1]+vector[2])/3.0;
+    scalar = (vector[0] + vector[1] + vector[2]) / 3.0;
   else
-    scalar = (vector[0]+vector[1])/2.0;
+    scalar = (vector[0] + vector[1]) / 2.0;
   return scalar;
 }
@@ -217,9 +221,8 @@
 void ComputeStressTally::compute_peratom()
 {
   invoked_peratom = update->ntimestep;
-  if ((did_setup != invoked_peratom)
-      || (update->eflag_global != invoked_peratom))
-    error->all(FLERR,"Energy was not tallied on needed timestep");
+  if ((did_setup != invoked_peratom) || (update->eflag_global != invoked_peratom))
+    error->all(FLERR, "Energy was not tallied on needed timestep");

   // collect contributions from ghost atoms
@@ -228,8 +231,7 @@ void ComputeStressTally::compute_peratom()

     const int nall = atom->nlocal + atom->nghost;
     for (int i = atom->nlocal; i < nall; ++i)
-      for (int j = 0; j < size_peratom_cols; ++j)
-        stress[i][j] = 0.0;
+      for (int j = 0; j < size_peratom_cols; ++j) stress[i][j] = 0.0;
   }

   // convert to stress*volume units = -pressure*volume
@@ -251,7 +253,6 @@

 double ComputeStressTally::memory_usage()
 {
-  double bytes = (nmax < 0) ? 0 : nmax*size_peratom_cols * sizeof(double);
+  double bytes = (nmax < 0) ? 0 : nmax * (double)size_peratom_cols * sizeof(double);
   return bytes;
 }
-
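
A note on the recurring memory_usage() change in the last patch: the old expression
`nmax*size_peratom_cols * sizeof(double)` multiplies two plain ints first, so the
intermediate product is evaluated in int arithmetic and can overflow for large
per-process atom counts before it is ever widened for the double return value;
casting one factor to double moves the entire product into floating point. A
minimal standalone sketch of the pattern (illustration only, with hypothetical
names, not LAMMPS code):

#include <cstdio>

// byte count for an nrows x ncols array of doubles
static double bytes_used(int nrows, int ncols)
{
  // "nrows * ncols * sizeof(double)" would compute nrows * ncols in int
  // arithmetic first, which can overflow; casting one factor makes the
  // whole product evaluate as double, matching the fix in the patch above
  return (nrows < 0) ? 0 : nrows * (double) ncols * sizeof(double);
}

int main()
{
  std::printf("%.0f bytes\n", bytes_used(100000, 6));    // prints 4800000 bytes
  return 0;
}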
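The compute_hma.cpp hunks in the same patch swap memory->create()/memory->destroy()
for plain new[]/delete[] on extlist and vector, so that every allocation is released
by a deallocator from the same family; presumably that pairing is what CodeQL flagged,
since LAMMPS's Memory class is malloc-based while the arrays were also candidates for
delete[]. A hedged sketch of the general rule (plain C++, not the LAMMPS Memory API):

#include <cstdlib>

int main()
{
  // pair new[] with delete[] ...
  double *a = new double[6];
  delete[] a;

  // ... and malloc() with free(); mixing the families, e.g. calling
  // delete[] on malloc'ed memory, is undefined behavior
  double *b = (double *) std::malloc(6 * sizeof(double));
  std::free(b);
  return 0;
}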