Merge branch 'master' of github.com:lammps/lammps into kk_verlet

Stan Gerald Moore
2021-07-27 14:28:54 -06:00
75 changed files with 1346 additions and 528 deletions

View File

@ -244,15 +244,16 @@ if(PKG_ADIOS)
endif()
if(NOT CMAKE_CROSSCOMPILING)
set(MPI_CXX_SKIP_MPICXX TRUE)
find_package(MPI QUIET)
option(BUILD_MPI "Build MPI version" ${MPI_FOUND})
else()
set(MPI_CXX_SKIP_MPICXX TRUE)
option(BUILD_MPI "Build MPI version" OFF)
endif()
if(BUILD_MPI)
# do not include the (obsolete) MPI C++ bindings which makes
# for leaner object files and avoids namespace conflicts
set(MPI_CXX_SKIP_MPICXX TRUE)
# We use a non-standard procedure to cross-compile with MPI on Windows
if((CMAKE_SYSTEM_NAME STREQUAL "Windows") AND CMAKE_CROSSCOMPILING)
include(MPI4WIN)
@ -368,6 +369,8 @@ if(PKG_MSCG OR PKG_ATC OR PKG_AWPMD OR PKG_ML-QUIP OR PKG_LATTE)
endif()
endif()
# tweak jpeg library names to avoid linker errors with MinGW cross-compilation
set(JPEG_NAMES libjpeg libjpeg-62)
find_package(JPEG QUIET)
option(WITH_JPEG "Enable JPEG support" ${JPEG_FOUND})
if(WITH_JPEG)

View File

@ -54,8 +54,8 @@ if(DOWNLOAD_PLUMED)
set(PLUMED_BUILD_BYPRODUCTS "<INSTALL_DIR>/lib/libplumedWrapper.a")
endif()
set(PLUMED_URL "https://github.com/plumed/plumed2/releases/download/v2.7.1/plumed-src-2.7.1.tgz" CACHE STRING "URL for PLUMED tarball")
set(PLUMED_MD5 "4eac6a462ec84dfe0cec96c82421b8e8" CACHE STRING "MD5 checksum of PLUMED tarball")
set(PLUMED_URL "https://github.com/plumed/plumed2/releases/download/v2.7.2/plumed-src-2.7.2.tgz" CACHE STRING "URL for PLUMED tarball")
set(PLUMED_MD5 "cfa0b4dd90a81c25d3302e8d97bfeaea" CACHE STRING "MD5 checksum of PLUMED tarball")
mark_as_advanced(PLUMED_URL)
mark_as_advanced(PLUMED_MD5)
@ -72,7 +72,6 @@ if(DOWNLOAD_PLUMED)
${PLUMED_CONFIG_OMP}
CXX=${PLUMED_CONFIG_CXX}
CC=${PLUMED_CONFIG_CC}
PATCH_COMMAND sed -i "/^#include <algorithm>/a #include <limits>" <SOURCE_DIR>/src/lepton/Operation.h
BUILD_BYPRODUCTS ${PLUMED_BUILD_BYPRODUCTS}
)
ExternalProject_get_property(plumed_build INSTALL_DIR)

View File

@ -25,7 +25,7 @@ RasMol visualization programs. Pizza.py has tools that do interactive
3d OpenGL visualization and one that creates SVG images of dump file
snapshots.
.. _pizza: https://pizza.sandia.gov
.. _pizza: https://lammps.github.io/pizza
.. _ensight: https://www.ansys.com/products/fluids/ansys-ensight

View File

@ -24,11 +24,15 @@ General features
^^^^^^^^^^^^^^^^
* runs on a single processor or in parallel
* distributed-memory message-passing parallelism (MPI)
* spatial-decomposition of simulation domain for parallelism
* open-source distribution
* highly portable C++
* optional libraries used: MPI and single-processor FFT
* distributed memory message-passing parallelism (MPI)
* shared memory multi-threading parallelism (OpenMP)
* spatial decomposition of simulation domain for MPI parallelism
* particle decomposition inside spatial decomposition for OpenMP parallelism
* GPLv2-licensed open-source distribution
* highly portable C++11
* modular code with most functionality in optional packages
* depends only on the MPI library for basic parallel functionality
* other libraries are optional and only required for specific packages
* GPU (CUDA and OpenCL), Intel Xeon Phi, and OpenMP support for many code features
* easy to extend with new features and functionality
* runs from an input script
@ -68,9 +72,9 @@ Interatomic potentials (force fields)
:doc:`improper style <improper_style>`, :doc:`kspace style <kspace_style>`
commands)
* pairwise potentials: Lennard-Jones, Buckingham, Morse, Born-Mayer-Huggins, Yukawa, soft, class 2 (COMPASS), hydrogen bond, tabulated
* charged pairwise potentials: Coulombic, point-dipole
* many-body potentials: EAM, Finnis/Sinclair EAM, modified EAM (MEAM), embedded ion method (EIM), EDIP, ADP, Stillinger-Weber, Tersoff, REBO, AIREBO, ReaxFF, COMB, SNAP, Streitz-Mintmire, 3-body polymorphic
* long-range interactions for charge, point-dipoles, and LJ dispersion: Ewald, Wolf, PPPM (similar to particle-mesh Ewald)
* polarization models: :doc:`QEq <fix_qeq>`, :doc:`core/shell model <Howto_coreshell>`, :doc:`Drude dipole model <Howto_drude>`
* charge equilibration (QEq via dynamic, point, shielded, Slater methods)
@ -170,9 +174,12 @@ Multi-replica models
^^^^^^^^^^^^^^^^^^^^
* :doc:`nudged elastic band <neb>`
* :doc:`hyperdynamics <hyper>`
* :doc:`parallel replica dynamics <prd>`
* :doc:`temperature accelerated dynamics <tad>`
* :doc:`parallel tempering <temper>`
* :doc:`path-integral MD <fix_pimd>`
* multi-walker collective variables with :doc:`Colvars <fix_colvars>` and :doc:`Plumed <fix_plumed>`
.. _prepost:
@ -187,7 +194,7 @@ Pre- and post-processing
plotting, and visualization for LAMMPS simulations. Pizza.py is
written in `Python <python_>`_ and is available for download from `the Pizza.py WWW site <pizza_>`_.
.. _pizza: https://pizza.sandia.gov
.. _pizza: https://lammps.github.io/pizza
.. _python: http://www.python.org

View File

@ -77,7 +77,7 @@ Here are suggestions on how to perform these tasks:
it easier to analyze and plot. See the :doc:`Tools <Tools>` doc page
for more discussion of the various tools.
* **Pizza.py:** Our group has also written a separate toolkit called
`Pizza.py <https://pizza.sandia.gov>`_ which can do certain kinds of
`Pizza.py <https://lammps.github.io/pizza>`_ which can do certain kinds of
setup, analysis, plotting, and visualization (via OpenGL) for LAMMPS
simulations. It thus provides some functionality for several of the
above bullets. Pizza.py is written in `Python <http://www.python.org>`_

View File

@ -18,10 +18,11 @@ supercomputers.
.. _mpi: https://en.wikipedia.org/wiki/Message_Passing_Interface
.. _lws: https://www.lammps.org
LAMMPS is written in C++. Earlier versions were written in F77 and
F90. See the `History page <https://www.lammps.org/history.html>`_ of
the website for details. All versions can be downloaded from the
`LAMMPS website <lws_>`_.
LAMMPS is written in C++ and requires a compiler that is at least
compatible with the C++11 standard.
Earlier versions were written in F77 and F90. See the `History page
<https://www.lammps.org/history.html>`_ of the website for details. All
versions can be downloaded from the `LAMMPS website <lws_>`_.
LAMMPS is designed to be easy to modify or extend with new
capabilities, such as new force fields, atom types, boundary
@ -41,8 +42,9 @@ short distances, so that the local density of particles never becomes
too large. This is in contrast to methods used for modeling plasma
or gravitational bodies (e.g. galaxy formation).
On parallel machines, LAMMPS uses spatial-decomposition techniques to
partition the simulation domain into small sub-domains of equal
computational cost, one of which is assigned to each processor.
Processors communicate and store "ghost" atom information for atoms
that border their sub-domain.
On parallel machines, LAMMPS uses spatial-decomposition techniques with
MPI parallelization to partition the simulation domain into small
sub-domains of equal computational cost, one of which is assigned to
each processor. Processors communicate and store "ghost" atom
information for atoms that border their sub-domain. Multi-threading
parallelization with particle decomposition can be used in addition.
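As a rough illustration of this scheme, here is a minimal, hypothetical sketch (not LAMMPS code; all names are invented) of a 1d spatial decomposition with periodic ghost-atom exchange between neighboring MPI ranks:

// Hypothetical sketch, not LAMMPS code: each rank owns one slab of a
// periodic 1d domain and ships atoms near its boundary to a neighbor
// rank as ghosts.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  const double xlo = 0.0, xhi = 100.0, cutoff = 2.5;
  const double dx = (xhi - xlo) / nprocs;   // equal slabs: equal cost if density is uniform
  const double sublo = xlo + rank * dx, subhi = sublo + dx;

  // a few owned atoms spread over this rank's sub-domain
  std::vector<double> x;
  for (int i = 0; i < 8; i++) x.push_back(sublo + (i + 0.5) * dx / 8.0);

  // atoms within the cutoff of a boundary become ghosts on the neighbor
  std::vector<double> to_left, to_right;
  for (double xi : x) {
    if (xi < sublo + cutoff) to_left.push_back(xi);
    if (xi >= subhi - cutoff) to_right.push_back(xi);
  }

  const int left = (rank - 1 + nprocs) % nprocs;   // periodic boundaries
  const int right = (rank + 1) % nprocs;

  // exchange the count, then the coordinates (send right, receive from left;
  // a full halo exchange repeats this in the opposite direction)
  int nsend = (int) to_right.size(), nrecv = 0;
  MPI_Sendrecv(&nsend, 1, MPI_INT, right, 0, &nrecv, 1, MPI_INT, left, 0,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  std::vector<double> ghosts(nrecv);
  MPI_Sendrecv(to_right.data(), nsend, MPI_DOUBLE, right, 1,
               ghosts.data(), nrecv, MPI_DOUBLE, left, 1,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  std::printf("rank %d: %zu owned atoms, %d ghosts\n", rank, x.size(), nrecv);
  MPI_Finalize();
  return 0;
}

LAMMPS itself decomposes in all three dimensions and supports load balancing; the sketch only shows the owned/ghost split the paragraph describes.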

View File

@ -35,9 +35,9 @@ visualization package you have installed.
Note that for GL, you need to be able to run the Pizza.py GL tool,
which is included in the pizza sub-directory. See the Pizza.py doc pages for more info:
* `https://pizza.sandia.gov <pizza_>`_
* `https://lammps.github.io/pizza <pizza_>`_
.. _pizza: https://pizza.sandia.gov
.. _pizza: https://lammps.github.io/pizza
Note that for AtomEye, you need version 3, and there is a line in the
scripts that specifies the path and name of the executable. See the

View File

@ -15,7 +15,7 @@ Sandia which provides tools for doing setup, analysis, plotting, and
visualization for LAMMPS simulations.
.. _lws: https://www.lammps.org
.. _pizza: https://pizza.sandia.gov
.. _pizza: https://lammps.github.io/pizza
.. _python: https://www.python.org
Additional tools included in the LAMMPS distribution are described on

View File

@ -558,7 +558,7 @@ Related commands
:doc:`group <group>`, :doc:`processors <processors>`,
:doc:`fix balance <fix_balance>`, :doc:`comm_style <comm_style>`
.. _pizza: https://pizza.sandia.gov
.. _pizza: https://lammps.github.io/pizza
Default
"""""""

View File

@ -119,8 +119,7 @@ The per-atom vector values will be an ID > 0, as explained above.
Restrictions
""""""""""""
These computes are part of the EXTRA-COMPUTE package. They are only enabled if
LAMMPS was built with that package. See the :doc:`Build package <Build_package>` page for more info.
none
Related commands
""""""""""""""""

View File

@ -182,8 +182,7 @@ page for an overview of LAMMPS output options.
Restrictions
""""""""""""
This compute is part of the EXTRA-COMPUTE package. It is only enabled if
LAMMPS was built with that package. See the :doc:`Build package <Build_package>` page for more info.
none
Related commands
""""""""""""""""

View File

@ -230,7 +230,7 @@ individual values and the file itself.
The *atom*, *local*, and *custom* styles create files in a simple text
format that is self-explanatory when viewing a dump file. Some of the
LAMMPS post-processing tools described on the :doc:`Tools <Tools>` doc
page, including `Pizza.py <https://pizza.sandia.gov>`_,
page, including `Pizza.py <https://lammps.github.io/pizza>`_,
work with this format, as does the :doc:`rerun <rerun>` command.
For post-processing purposes the *atom*, *local*, and *custom* text

View File

@ -590,8 +590,8 @@ Play the movie:
% mplayer foo.mpg
% ffplay bar.avi
* c) Use the `Pizza.py <https://pizza.sandia.gov>`_
`animate tool <https://pizza.sandia.gov/doc/animate.html>`_,
* c) Use the `Pizza.py <https://lammps.github.io/pizza>`_
`animate tool <https://lammps.github.io/pizza/doc/animate.html>`_,
which works directly on a series of image files.
.. code-block:: python

View File

@ -403,7 +403,7 @@ Related commands
:doc:`group <group>`, :doc:`processors <processors>`, :doc:`balance <balance>`,
:doc:`comm_style <comm_style>`
.. _pizza: https://pizza.sandia.gov
.. _pizza: https://lammps.github.io/pizza
Default
"""""""

View File

@ -89,7 +89,7 @@ first stage) is changed to:
.. parsed-literal::
Fi = -Grad(V) + 2 (Grad(V) dot T') T'
Fi = -Grad(V) + 2 (Grad(V) dot T') T' + Fnudge_perp
and the relaxation procedure is continued to a new converged MEP.
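For concreteness, here is a small sketch of evaluating this modified force from a potential gradient and the unit tangent T' along the path. This is generic 3-vector math, not the LAMMPS implementation, and the new Fnudge_perp term from this change would be added on top:

// Generic 3-vector sketch, not the LAMMPS implementation.
#include <array>
using Vec3 = std::array<double,3>;

static double dot(const Vec3 &a, const Vec3 &b) {
  return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
}

// gradV: Grad(V) at this replica; tangent: unit tangent T' along the path.
// Returns Fi = -Grad(V) + 2 (Grad(V) dot T') T', i.e. the force component
// parallel to the path is inverted so relaxation climbs toward the saddle
// point; this patch adds Fnudge_perp on top of this.
Vec3 neb_modified_force(const Vec3 &gradV, const Vec3 &tangent) {
  const double g_dot_t = dot(gradV, tangent);
  Vec3 f;
  for (int k = 0; k < 3; k++)
    f[k] = -gradV[k] + 2.0 * g_dot_t * tangent[k];
  return f;
}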

View File

@ -53,6 +53,7 @@ checksums = { \
'2.6.3' : 'a9f8028fd74528c2024781ea1fdefeee', \
'2.7.0' : '95f29dd0c067577f11972ff90dfc7d12', \
'2.7.1' : '4eac6a462ec84dfe0cec96c82421b8e8', \
'2.7.2' : 'cfa0b4dd90a81c25d3302e8d97bfeaea', \
}
# parse and process arguments

View File

@ -1823,7 +1823,6 @@ class lammps(object):
with ExceptionCheck(self):
return self.lib.lammps_fix_external_get_force(self.lmp, fix_id.encode())
return None
# -------------------------------------------------------------------------

View File

@ -647,7 +647,6 @@ void PPPMDispDielectric::fieldforce_c_ad()
// convert E-field to force and subtract self forces
const double qfactor = qqrd2e * scale;
double qtmp = eps[i]*q[i];
s1 = x[i][0]*hx_inv;
s2 = x[i][1]*hy_inv;
@ -751,7 +750,7 @@ void PPPMDispDielectric::fieldforce_c_peratom()
extended to non-neutral systems (J. Chem. Phys. 131, 094107).
------------------------------------------------------------------------- */
void PPPMDispDielectric::slabcorr(int eflag)
void PPPMDispDielectric::slabcorr(int /*eflag*/)
{
// compute local contribution to global dipole moment
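The signature change above uses the standard C++ idiom for an intentionally unused parameter: the name is commented out so that callers and overrides are unaffected while the compiler stops issuing unused-parameter warnings. A minimal sketch:

// Unchanged signature, silenced warning: the parameter stays in the
// declaration, but its name is commented out because the body never
// reads it.
void slabcorr_sketch(int /*eflag*/) {
  // ... body that does not use the flag ...
}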

View File

@ -116,7 +116,7 @@ ComputeHMA::ComputeHMA(LAMMPS *lmp, int narg, char **arg) :
computeU = computeP = computeCv = -1;
returnAnharmonic = 0;
size_vector = 0;
memory->create(extlist, 3, "hma:extlist");
extlist = new int[3];
for (int iarg=4; iarg<narg; iarg++) {
if (!strcmp(arg[iarg], "u")) {
if (computeU>-1) continue;
@ -145,20 +145,11 @@ ComputeHMA::ComputeHMA(LAMMPS *lmp, int narg, char **arg) :
}
}
if (size_vector == 0) {
error->all(FLERR,"Illegal compute hma command");
}
if (size_vector<3) {
memory->grow(extlist, size_vector, "hma:extlist");
}
memory->create(vector, size_vector, "hma:vector");
if (size_vector == 0) error->all(FLERR,"Illegal compute hma command");
vector = new double[size_vector];
if (computeU>-1 || computeCv>-1) {
peflag = 1;
}
if (computeP>-1) {
pressflag = 1;
}
if (computeU>-1 || computeCv>-1) peflag = 1;
if (computeP>-1) pressflag = 1;
nmax = 0;
}
@ -170,10 +161,11 @@ ComputeHMA::~ComputeHMA()
// check nfix in case all fixes have already been deleted
if (modify->nfix) modify->delete_fix(id_fix);
delete [] id_fix;
delete [] id_temp;
memory->destroy(extlist);
memory->destroy(vector);
delete[] id_fix;
delete[] id_temp;
delete[] extlist;
delete[] vector;
memory->destroy(deltaR);
}
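The allocation changes above replace the LAMMPS memory->create()/memory->destroy() pair with plain new[]/delete[] for these small fixed-size arrays. The invariant being restored is that every array is released by the deallocator matching its allocator; a minimal sketch of the corrected pairing:

// Sketch of the invariant behind this change: pair each allocator with
// its own deallocator. memory->create() pairs with memory->destroy();
// new[] pairs with delete[]. Mixing the two is undefined behavior.
void hma_alloc_sketch(int size_vector) {
  int *extlist = new int[3];             // was memory->create(extlist, 3, ...)
  double *vector = new double[size_vector];
  // ... use the arrays ...
  delete[] extlist;                      // was memory->destroy(extlist)
  delete[] vector;
}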

View File

@ -162,7 +162,11 @@ void AngleCharmmIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@ -246,7 +250,11 @@ void AngleCharmmIntel::eval(const int vflag,
// apply force to each of 3 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {
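This #if/#else block is the pattern applied throughout the INTEL package in this commit: the LLVM-based Intel compiler no longer accepts the legacy "#pragma simd", so the OpenMP SIMD form is selected when USE_OMP_SIMD is defined. A stand-alone sketch of the idiom with a hypothetical loop (in the real code it is additionally guarded by LMP_SIMD_COMPILER checks):

// Stand-alone sketch of the pragma dispatch (hypothetical function).
double dot_reduce(const double *a, const double *b, int n) {
  double sum = 0.0;
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sum)   // oneAPI icpx and other OpenMP compilers
#else
#pragma simd reduction(+:sum)       // classic Intel icc only
#endif
  for (int i = 0; i < n; i++) sum += a[i] * b[i];
  return sum;
}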

View File

@ -162,7 +162,11 @@ void AngleHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@ -228,7 +232,11 @@ void AngleHarmonicIntel::eval(const int vflag,
// apply force to each of 3 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@ -158,7 +158,11 @@ void BondFENEIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@ -215,7 +219,11 @@ void BondFENEIntel::eval(const int vflag,
// apply force to each of 2 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@ -155,7 +155,11 @@ void BondHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@ -184,7 +188,11 @@ void BondHarmonicIntel::eval(const int vflag,
// apply force to each of 2 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@ -181,9 +181,16 @@ void DihedralCharmmIntel::eval(const int vflag,
}
#if defined(LMP_SIMD_COMPILER_TEST)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \
spv5)
#else
#pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \
spv5)
#endif
#pragma vector aligned
for (int n = nfrom; n < nto; n++) {
#endif
for (int n = nfrom; n < nto; n += npl) {
@ -329,7 +336,11 @@ void DihedralCharmmIntel::eval(const int vflag,
#if defined(LMP_SIMD_COMPILER_TEST)
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i2 < nlocal) {
@ -408,7 +419,11 @@ void DihedralCharmmIntel::eval(const int vflag,
// apply force to each of 4 atoms
#if defined(LMP_SIMD_COMPILER_TEST)
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@ -154,7 +154,11 @@ void DihedralFourierIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@ -304,7 +308,11 @@ void DihedralFourierIntel::eval(const int vflag,
}
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@ -154,7 +154,11 @@ void DihedralHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@ -299,7 +303,11 @@ void DihedralHarmonicIntel::eval(const int vflag,
}
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@ -158,7 +158,11 @@ void DihedralOPLSIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@ -319,7 +323,11 @@ void DihedralOPLSIntel::eval(const int vflag,
}
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@ -635,19 +635,31 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
if (_nthreads == 4) {
acc_t *f_scalar3 = f_scalar2 + f_stride4;
acc_t *f_scalar4 = f_scalar3 + f_stride4;
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
} else if (_nthreads == 2) {
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n];
} else {
acc_t *f_scalar3 = f_scalar2 + f_stride4;
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n] + f_scalar3[n];
}
@ -662,8 +674,12 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
acc_t *f_scalar2 = f_scalar + f_stride4;
for (int t = 1; t < _nthreads; t++) {
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = iifrom; n < iito; n++)
f_scalar[n] += f_scalar2[n];
f_scalar2 += f_stride4;
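The loops above sum per-thread force accumulators into the first thread's copy. A scalar sketch of the same reduction pattern, with illustrative names (each thread owns a block of length f_stride4 inside one large aligned buffer):

// Scalar sketch with illustrative names: thread t's forces live at
// f + t * f_stride4; all blocks are summed into thread 0's block
// after the force computation.
void reduce_thread_forces(double *f, int f_stride4, int nthreads, int range) {
  for (int t = 1; t < nthreads; t++) {
    const double *ft = f + (long) t * f_stride4;
#if defined(USE_OMP_SIMD)
#pragma omp simd
#endif
    for (int n = 0; n < range; n++)
      f[n] += ft[n];
  }
}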

View File

@ -99,8 +99,12 @@ void FixNHIntel::remap()
if (allremap) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
const double d0 = x[i].x - b0;
@ -112,8 +116,12 @@ void FixNHIntel::remap()
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & dilate_group_bit) {
@ -278,8 +286,12 @@ void FixNHIntel::remap()
if (allremap) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
@ -288,8 +300,12 @@ void FixNHIntel::remap()
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & dilate_group_bit) {
@ -415,8 +431,12 @@ void FixNHIntel::nh_v_press()
if (igroup == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
v[i].x *= f0;
@ -425,8 +445,12 @@ void FixNHIntel::nh_v_press()
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) {
@ -448,8 +472,12 @@ void FixNHIntel::nve_v()
double * _noalias const v = atom->v[0];
const double * _noalias const f = atom->f[0];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
v[i] += _dtfm[i] * f[i];
@ -468,15 +496,23 @@ void FixNHIntel::nve_x()
if (igroup == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
x[i] += dtv * v[i];
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
if (_dtfm[i] != 0.0)
@ -500,15 +536,23 @@ void FixNHIntel::nh_v_temp()
if (igroup == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
v[i] *= factor_eta;
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
if (_dtfm[i] != 0.0)

View File

@ -97,8 +97,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
dtq = 0.5 * dtv;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += _dtfm[i] * f[i];
@ -108,8 +112,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
// update angular momentum by 1/2 step
if (igroup == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
double *quat = bonus[ellipsoid[i]].quat;
@ -118,8 +126,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) {
@ -143,8 +155,12 @@ void FixNVEAsphereIntel::final_integrate()
const double * _noalias const torque = atom->torque[0];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += _dtfm[i] * f[i];

View File

@ -68,8 +68,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) {
const double dtfm = dtf / atom->mass[1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += dtfm * f[i];
@ -78,8 +82,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
} else if (igroup == 0) {
if (neighbor->ago == 0) reset_dt();
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += _dtfm[i] * f[i];
@ -88,8 +96,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
} else {
if (neighbor->ago == 0) reset_dt();
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
if (_dtfm[i] != 0.0) {
@ -112,16 +124,24 @@ void FixNVEIntel::final_integrate()
_nlocal3 = 3 * atom->nlocal;
const double dtfm = dtf / atom->mass[1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
v[i] += dtfm * f[i];
} else if (igroup == 0) {
if (neighbor->ago == 0) reset_dt();
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += _dtfm[i] * f[i];
@ -129,8 +149,12 @@ void FixNVEIntel::final_integrate()
} else {
if (neighbor->ago == 0) reset_dt();
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
v[i] += _dtfm[i] * f[i];

View File

@ -165,7 +165,11 @@ void ImproperCvffIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@ -247,7 +251,11 @@ void ImproperCvffIntel::eval(const int vflag,
flt_t p, pd;
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (m == 2) {
@ -319,7 +327,11 @@ void ImproperCvffIntel::eval(const int vflag,
// apply force to each of 4 atoms
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@ -167,7 +167,11 @@ void ImproperHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@ -276,7 +280,11 @@ void ImproperHarmonicIntel::eval(const int vflag,
// apply force to each of 4 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@ -127,7 +127,8 @@ struct vector_ops<double, KNC> {
}
template<int scale>
static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
return _mm512_mask_i32logather_pd(from, mask, idx, base, scale);
return _mm512_mask_i32gather_pd(from, mask, _mm512_castsi512_si256(idx),
base, scale);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return _mm512_mask_blend_pd(mask, a, b);
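This is the substitution made throughout the patch: the removed _mm512_mask_i32logather_pd variant took its 32-bit indices in the low half of a __m512i, while the standard AVX-512F gather takes a __m256i, so the index vector is narrowed with a zero-cost cast. A minimal sketch (requires AVX-512F):

// The cast keeps the low 256 bits (eight 32-bit indices), which is
// exactly what an 8-lane double gather consumes.
#include <immintrin.h>

__m512d masked_gather8(__m512d src, __mmask8 m, __m512i idx32,
                       const double *base) {
  return _mm512_mask_i32gather_pd(src, m, _mm512_castsi512_si256(idx32),
                                  base, 8 /* scale: sizeof(double) */);
}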

View File

@ -511,7 +511,8 @@ public:
const int scale) {
assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==8
return FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
return FVEC_SUFFIX(_mm512_i32gather_)(_mm512_castsi512_si256(idx.val_),
mem, sizeof(FVEC_SCAL_T));
# else
return FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
# endif
@ -522,8 +523,8 @@ public:
) {
assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==8
return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
mem, sizeof(FVEC_SCAL_T));
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_,
_mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T));
# else
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
mem, sizeof(FVEC_SCAL_T));
@ -609,8 +610,8 @@ public:
) {
assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==8
return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
mem, sizeof(FVEC_SCAL_T));
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_,
_mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T));
# else
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
mem, sizeof(FVEC_SCAL_T));
@ -622,8 +623,9 @@ public:
) {
assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==8
FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, a.val_,
sizeof(FVEC_SCAL_T));
FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_,
_mm512_castsi512_si256(idx.val_),
a.val_, sizeof(FVEC_SCAL_T));
# else
FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, a.val_,
sizeof(FVEC_SCAL_T));
@ -666,11 +668,11 @@ public:
const double * mem, const int scale
) {
assert(scale == sizeof(double));
__m512d lo = _mm512_mask_i32logather_pd(src.lo_, mask.val_, idx.val_, mem,
sizeof(double));
__m512d hi = _mm512_mask_i32logather_pd(src.hi_, get_bvec_hi(mask.val_),
get_ivec_hi(idx.val_), mem,
sizeof(double));
__m512d lo = _mm512_mask_i32gather_pd(src.lo_, mask.val_,
_mm512_castsi512_si256(idx.val_),
mem, sizeof(double));
__m512d hi = _mm512_mask_i32gather_pd(src.hi_, get_bvec_hi(mask.val_),
_mm512_castsi512_si256(get_ivec_hi(idx.val_)), mem, sizeof(double));
return avec16pd(lo, hi);
}
VEC_INLINE static void mask_i32loscatter(
@ -678,10 +680,12 @@ public:
const avec16pd &a, const int scale
) {
assert(scale == sizeof(double));
_mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_,
sizeof(double));
_mm512_mask_i32loscatter_pd(mem, get_bvec_hi(mask.val_),
get_ivec_hi(idx.val_), a.hi_, sizeof(double));
_mm512_mask_i32scatter_pd(mem, mask.val_,
_mm512_castsi512_si256(idx.val_), a.lo_,
sizeof(double));
_mm512_mask_i32scatter_pd(mem, get_bvec_hi(mask.val_),
_mm512_castsi512_si256(get_ivec_hi(idx.val_)),
a.hi_, sizeof(double));
}
#define AVEC2_BINOP(the_sym, the_name) \

View File

@ -17,8 +17,13 @@
------------------------------------------------------------------------- */
#ifdef __INTEL_LLVM_COMPILER
#define USE_OMP_SIMD
#define __INTEL_COMPILER __INTEL_LLVM_COMPILER
#define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
#define _MM_SCALE_1 1
#define _MM_SCALE_2 2
#define _MM_SCALE_4 4
#define _MM_SCALE_8 8
#endif
#ifdef __INTEL_COMPILER
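These additions let the package build with icpx, the LLVM-based Intel compiler, which defines __INTEL_LLVM_COMPILER but not the classic __INTEL_COMPILER macro or the _MM_SCALE_* constants; the header therefore maps one onto the other so the existing #ifdef __INTEL_COMPILER blocks stay active. A sketch of the resulting compile-time dispatch:

// Sketch of the dispatch after this header has been processed:
#if defined(__INTEL_LLVM_COMPILER)
  // oneAPI icpx: use "#pragma omp simd"; _MM_SCALE_* supplied by hand above
#elif defined(__INTEL_COMPILER)
  // classic icc: legacy "#pragma simd" and _MM_SCALE_* are available
#else
  // other compilers: the package's vectorization hints become no-ops
#endif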
@ -332,6 +337,9 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
#endif
// TO BE DEPRECATED
#ifndef USE_OMP_SIMD
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
f_stride, pos, ov0, ov1, ov2, \
ov3, ov4, ov5) \
@ -526,6 +534,198 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
} \
}
#else
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
f_stride, pos, ov0, ov1, ov2, \
ov3, ov4, ov5) \
{ \
acc_t *f_scalar = &f_start[0].x; \
flt_t *x_scalar = &pos[minlocal].x; \
int f_stride4 = f_stride * 4; \
_alignvar(acc_t ovv[16],64); \
int vwidth; \
if (sizeof(acc_t) == sizeof(double)) \
vwidth = INTEL_COMPILE_WIDTH/2; \
else \
vwidth = INTEL_COMPILE_WIDTH; \
if (vwidth < 4) vwidth = 4; \
_use_simd_pragma("omp simd aligned(ovv:64)") \
for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \
int remainder = lt % vwidth; \
if (lf > lt) remainder = 0; \
const int v_range = lt - remainder; \
if (nthreads == 2) { \
acc_t *f_scalar2 = f_scalar + f_stride4; \
for (int n = lf; n < v_range; n += vwidth) { \
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,ovv,x_scalar:64)")\
for (int v = 0; v < vwidth; v++) { \
f_scalar[n+v] += f_scalar2[n+v]; \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
} \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
if (vwidth > 4) { \
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
} \
if (vwidth > 8) { \
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
} \
} \
_use_simd_pragma("vector aligned") \
_use_simd_pragma("ivdep") \
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
for (int n = v_range; n < lt; n++) \
f_scalar[n] += f_scalar2[n]; \
} else if (nthreads==4) { \
acc_t *f_scalar2 = f_scalar + f_stride4; \
acc_t *f_scalar3 = f_scalar2 + f_stride4; \
acc_t *f_scalar4 = f_scalar3 + f_stride4; \
for (int n = lf; n < v_range; n += vwidth) { \
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4,ovv:64)") \
for (int v = 0; v < vwidth; v++) { \
f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \
f_scalar4[n+v]; \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
} \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
if (vwidth > 4) { \
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
} \
if (vwidth > 8) { \
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
} \
} \
_use_simd_pragma("vector aligned") \
_use_simd_pragma("ivdep") \
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
for (int n = v_range; n < lt; n++) \
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \
} else if (nthreads==1) { \
for (int n = lf; n < v_range; n += vwidth) { \
_use_simd_pragma("omp simd aligned(ovv,f_scalar,x_scalar:64)") \
for (int v = 0; v < vwidth; v++) \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
if (vwidth > 4) { \
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
} \
if (vwidth > 8) { \
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
} \
} \
} else if (nthreads==3) { \
acc_t *f_scalar2 = f_scalar + f_stride4; \
acc_t *f_scalar3 = f_scalar2 + f_stride4; \
for (int n = lf; n < v_range; n += vwidth) { \
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,ovv,x_scalar:64)") \
for (int v = 0; v < vwidth; v++) { \
f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
} \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
if (vwidth > 4) { \
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
} \
if (vwidth > 8) { \
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
} \
} \
_use_simd_pragma("vector aligned") \
_use_simd_pragma("ivdep") \
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
for (int n = v_range; n < lt; n++) \
f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \
} \
for (int n = v_range; n < lt; n += 4) { \
_use_simd_pragma("vector aligned") \
_use_simd_pragma("ivdep") \
for (int v = 0; v < 4; v++) \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
} \
ov0 += ovv[0]; \
ov1 += ovv[1]; \
ov2 += ovv[2]; \
if (vwidth > 4) { \
ov0 += ovv[4]; \
ov1 += ovv[5]; \
ov2 += ovv[6]; \
} \
if (vwidth > 8) { \
ov0 += ovv[8] + ovv[12]; \
ov1 += ovv[9] + ovv[13]; \
ov2 += ovv[10] + ovv[14]; \
} \
}
#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \
f_stride, pos, offload, vflag, ov0, ov1, \
ov2, ov3, ov4, ov5) \
{ \
int o_range = (nall - minlocal) * 4; \
IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \
sizeof(acc_t)); \
\
acc_t *f_scalar = &f_start[0].x; \
int f_stride4 = f_stride * 4; \
int t; \
if (vflag == VIRIAL_FDOTR) t = 4; else t = 1; \
acc_t *f_scalar2 = f_scalar + f_stride4 * t; \
for ( ; t < nthreads; t++) { \
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2:64)") \
for (int n = iifrom; n < iito; n++) \
f_scalar[n] += f_scalar2[n]; \
f_scalar2 += f_stride4; \
} \
\
if (vflag == VIRIAL_FDOTR) { \
int nt_min = MIN(4,nthreads); \
IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \
f_stride, pos, ov0, ov1, ov2, ov3, ov4, \
ov5); \
} \
}
#endif
#ifdef _LMP_INTEL_OFFLOAD
#include <sys/time.h>
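The long macro added above fuses the per-thread force reduction with accumulation of the six virial components from the f dot r sum. Stripped of the vectorization and unrolling, what it computes per owned atom is (scalar sketch, illustrative names):

// Scalar sketch: forces and positions are packed four values per atom
// (x,y,z,w), and ov0..ov5 collect the virial in (xx, yy, zz, xy, xz, yz)
// order, matching the products taken inside the macro.
void fdotr_virial(const double (*x)[4], const double (*f)[4], int nlocal,
                  double ov[6]) {
  for (int i = 0; i < nlocal; i++) {
    ov[0] += f[i][0] * x[i][0];  // xx
    ov[1] += f[i][1] * x[i][1];  // yy
    ov[2] += f[i][2] * x[i][2];  // zz
    ov[3] += f[i][1] * x[i][0];  // xy
    ov[4] += f[i][2] * x[i][0];  // xz
    ov[5] += f[i][2] * x[i][1];  // yz
  }
}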

View File

@ -173,7 +173,7 @@ namespace ip_simd {
}
inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) {
return _mm512_i32logather_pd(i, p, _MM_SCALE_8);
return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8);
}
inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p,
@ -190,8 +190,8 @@ namespace ip_simd {
inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p,
const SIMD_int &i) {
return _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, p,
_MM_SCALE_8);
return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), p, _MM_SCALE_8);
}
template <typename T>
@ -227,8 +227,8 @@ namespace ip_simd {
inline SIMD_double SIMD_gatherz(const SIMD_mask &m, const double *p,
const SIMD_int &i) {
return _mm512_mask_i32logather_pd( _mm512_set1_pd(0.0), m, i, p,
_MM_SCALE_8);
return _mm512_mask_i32gather_pd( _mm512_set1_pd(0.0), m,
_mm512_castsi512_si256(i),p, _MM_SCALE_8);
}
// ------- Store Operations
@ -257,7 +257,8 @@ namespace ip_simd {
inline void SIMD_scatter(const SIMD_mask &m, double *p,
const SIMD_int &i, const SIMD_double &vec) {
_mm512_mask_i32loscatter_pd(p, m, i, vec, _MM_SCALE_8);
_mm512_mask_i32scatter_pd(p, m, _mm512_castsi512_si256(i), vec,
_MM_SCALE_8);
}
// ------- Arithmetic Operations
@ -834,23 +835,29 @@ namespace ip_simd {
inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
const SIMD_int &i, SIMD_double &x,
SIMD_double &y, SIMD_double &z) {
x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom,
_MM_SCALE_2);
y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1,
_MM_SCALE_2);
z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2,
_MM_SCALE_2);
x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom,
_MM_SCALE_2);
y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom+1,
_MM_SCALE_2);
z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom+2,
_MM_SCALE_2);
}
inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
const SIMD_int &i, SIMD_double &x,
SIMD_double &y, SIMD_double &z, SIMD_int &type) {
x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom,
_MM_SCALE_2);
y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1,
_MM_SCALE_2);
z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2,
_MM_SCALE_2);
x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom,
_MM_SCALE_2);
y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom+1,
_MM_SCALE_2);
z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom+2,
_MM_SCALE_2);
type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3,
_MM_SCALE_2);
}
@ -888,10 +895,12 @@ namespace ip_simd {
const SIMD_int &joffset, SIMD_double &eng) {
SIMD_double jeng;
SIMD_conflict_pi_reduce1(rmask, joffset, eng);
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
force, _MM_SCALE_2);
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force, _MM_SCALE_2);
jeng = jeng + eng;
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
jeng, _MM_SCALE_2);
}
inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
@ -899,20 +908,24 @@ namespace ip_simd {
SIMD_double engd, jeng;
engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng));
SIMD_conflict_pi_reduce1(rmask, joffset, engd);
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
force, _MM_SCALE_2);
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force, _MM_SCALE_2);
jeng = jeng + engd;
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
jeng, _MM_SCALE_2);
SIMD_mask rmask2 = rmask >> 8;
engd = _mm512_cvtps_pd(_mm512_castps512_ps256(
_mm512_shuffle_f32x4(eng,eng,238)));
SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
SIMD_conflict_pi_reduce1(rmask2, joffset2, engd);
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
force, _MM_SCALE_2);
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
_mm512_castsi512_si256(joffset2),
force, _MM_SCALE_2);
jeng = jeng + engd;
_mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
jeng, _MM_SCALE_2);
}
inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force,
@ -926,10 +939,12 @@ namespace ip_simd {
SIMD_double jeng;
SIMD_conflict_pi_reduce1(rmask, joffset, eng);
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
force, _MM_SCALE_2);
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force, _MM_SCALE_2);
jeng = jeng + eng;
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
jeng, _MM_SCALE_2);
}
inline void SIMD_safe_jforce(const SIMD_mask &m, float *force,
@ -956,18 +971,24 @@ namespace ip_simd {
SIMD_double &fy, SIMD_double &fz) {
SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
SIMD_double jfrc;
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force,
_MM_SCALE_2);
jfrc = jfrc + fx;
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
_MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 1,
_MM_SCALE_2);
jfrc = jfrc + fy;
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
_MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 2,
_MM_SCALE_2);
jfrc = jfrc + fz;
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
}
inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force,
@ -979,40 +1000,54 @@ namespace ip_simd {
amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
SIMD_double jfrc;
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
force, _MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force, _MM_SCALE_2);
jfrc = jfrc + amxd;
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
force + 1, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force + 1, _MM_SCALE_2);
jfrc = jfrc + amyd;
_mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
force + 2, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+1, rmask, _mm512_castsi512_si256(joffset),
jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force + 2, _MM_SCALE_2);
jfrc = jfrc + amzd;
_mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, rmask, _mm512_castsi512_si256(joffset),
jfrc, _MM_SCALE_2);
SIMD_mask rmask2 = rmask >> 8;
amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
_mm512_shuffle_f32x4(amx,amx,238)));
amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(
_mm512_shuffle_f32x4(amy,amy,238)));
amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(
_mm512_shuffle_f32x4(amz,amz,238)));
SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
force, _MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
_mm512_castsi512_si256(joffset2),
force, _MM_SCALE_2);
jfrc = jfrc + amxd;
_mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
force + 1, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
_mm512_castsi512_si256(joffset2),
force + 1, _MM_SCALE_2);
jfrc = jfrc + amyd;
_mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
force + 2, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+1, rmask2,
_mm512_castsi512_si256(joffset2), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
_mm512_castsi512_si256(joffset2),
force + 2, _MM_SCALE_2);
jfrc = jfrc + amzd;
_mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, rmask2,
_mm512_castsi512_si256(joffset2), jfrc,
_MM_SCALE_2);
}
inline void SIMD_jforce_update(const SIMD_mask &m, float *force,
@ -1064,18 +1099,24 @@ namespace ip_simd {
const SIMD_int &i, const SIMD_double &fx,
const SIMD_double &fy, const SIMD_double &fz) {
SIMD_double jfrc;
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force,
_MM_SCALE_2);
jfrc = jfrc - fx;
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
_MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 1,
_MM_SCALE_2);
jfrc = jfrc - fy;
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
_MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 2,
_MM_SCALE_2);
jfrc = jfrc - fz;
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
}
inline void SIMD_jforce_update(const SIMD_mask &rmask,
@ -1502,11 +1543,12 @@ namespace ip_simd {
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
hmask, k, force + 3,
_MM_SCALE_2);
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
_mm512_castsi512_si256(k),
force + 3, _MM_SCALE_2);
keng = keng + hevdwl;
_mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
keng, _MM_SCALE_2);
}
}
@ -1523,11 +1565,12 @@ namespace ip_simd {
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
hmask, k, force + 3,
_MM_SCALE_2);
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
_mm512_castsi512_si256(k),
force + 3, _MM_SCALE_2);
keng = keng + hevdwl;
_mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
keng, _MM_SCALE_2);
}
SIMD_mask hmask2 = hmask >> 8;
facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(
@ -1539,11 +1582,13 @@ namespace ip_simd {
fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238);
SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl);
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
hmask2, k2, force + 3,
_MM_SCALE_2);
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(),
hmask2,
_mm512_castsi512_si256(k2),
force + 3, _MM_SCALE_2);
keng = keng + hevdwl;
_mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force + 3, hmask2, _mm512_castsi512_si256(k2),
keng, _MM_SCALE_2);
}
}
@ -1815,24 +1860,32 @@ namespace ip_simd {
const int EFLAG, const int eatom,
const SIMD_double &fwtmp) {
SIMD_double jfrc;
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force,
_MM_SCALE_2);
jfrc = jfrc + fx;
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
_MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 1,
_MM_SCALE_2);
jfrc = jfrc + fy;
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
_MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 2,
_MM_SCALE_2);
jfrc = jfrc + fz;
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
if (EFLAG) {
if (eatom) {
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i,
force + 3, _MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i),
force + 3, _MM_SCALE_2);
jfrc = jfrc + fwtmp;
_mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+3, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
}
}
}

View File

@ -324,7 +324,11 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++)
tj[ncount++] = binpacked[jj];
@ -345,15 +349,23 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + stencil[k]];
const int bend = binhead[ibin + stencil[k] + 1];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++)
tj[ncount++] = binpacked[jj];
}
} // if i < nlocal
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int u = 0; u < ncount; u++) {
const int j = tj[u];
@ -425,12 +437,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
int alln = n;
n = 0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#ifdef LMP_INTEL_NBOR_COMPAT
#pragma ivdep
#else
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
#pragma vector aligned
#endif
for (int u = 0; u < alln; u++) {
int which;
@ -454,12 +470,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
alln = n2;
n2 = maxnbors * 2;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#ifdef LMP_INTEL_NBOR_COMPAT
#pragma ivdep
#else
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
#pragma vector aligned
#endif
for (int u = n2; u < alln; u++) {
int which;

View File

@ -344,14 +344,22 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++)
tj[ncount++] = binpacked[jj];
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int u = 0; u < ncount; u++) {
const int j = tj[u];
@ -375,7 +383,11 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin];
const int bend = binhead[ibin + 1];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++) {
const int j = binpacked[jj];
@ -533,12 +545,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
n = pack_offset;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#ifdef LMP_INTEL_NBOR_COMPAT
#pragma ivdep
#else
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
#pragma vector aligned
#endif
for (int u = n; u < alln; u++) {
int which;
@ -566,12 +582,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
n2 = pack_offset + maxnbors;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#ifdef LMP_INTEL_NBOR_COMPAT
#pragma ivdep
#else
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
#pragma vector aligned
#endif
for (int u = n2; u < alln; u++) {
int which;
@ -737,8 +757,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
int jnum = numneigh[i];
if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
#if __INTEL_COMPILER+0 > 1499
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(max:vlmax,vgmax) \
reduction(min:vlmin, vgmin)
#else
#pragma simd reduction(max:vlmax,vgmax) \
reduction(min:vlmin, vgmin)
#endif
#pragma vector aligned
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
#endif
for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj] & NEIGHMASK;
@ -782,8 +808,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
int jnum = numneigh[i];
if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
int jj = 0;
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (jj = 0; jj < jnum; jj++) {
const int which = jlist[jj] >> SBBITS & 3;
const int j = jlist[jj] & NEIGHMASK;
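The bound-tracking loop above gets the same treatment with a combined min/max reduction; reduction(max:...) and reduction(min:...) on omp simd require OpenMP 4.0 or newer, which is effectively what USE_OMP_SIMD asserts. A small stand-in sketch (names are hypothetical):

static void simd_bounds(const int *v, int n, int &vmin, int &vmax)
{
  // reduction variables must not be references, so work on locals
  int lo = vmin, hi = vmax;
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(min:lo) reduction(max:hi)
#endif
  for (int i = 0; i < n; i++) {
    if (v[i] < lo) lo = v[i];
    if (v[i] > hi) hi = v[i];
  }
  vmin = lo;
  vmax = hi;
}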

View File

@ -292,8 +292,9 @@ void PairAIREBOIntel::compute(
ev_init(eflag,vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
pvector[0] = pvector[1] = pvector[2] = 0.0;
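This guard change recurs in every pair style below: a virial request under pair_modify nofdotr used to be rejected unconditionally by the INTEL styles, and is now rejected only together with newton on, apparently because the vectorized kernels can tally the per-pair virial themselves only when Newton's third law communication is off. A stand-in sketch of the relaxed condition (flags passed as arguments instead of read from class members):

static bool nofdotr_allowed(bool vflag, bool vflag_fdotr, bool newton_pair)
{
  // error only when a virial is requested, fdotr accumulation is off,
  // and newton_pair is on; every other combination is now accepted
  return !(vflag && !vflag_fdotr && newton_pair);
}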

View File

@ -77,8 +77,9 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag,
ev_init(eflag,vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
const int inum = list->inum;
const int nthreads = comm->nthreads;
@ -248,12 +249,18 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
if (NEWTON_PAIR == 0)
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
if (vflag == VIRIAL_PAIR)
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcecoul, forcebuck, evdwl, ecoul;
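In the force loops every accumulator, the three force components, the energy terms, and the six virial terms, appears in a single reduction(+:...) clause so that each SIMD lane carries private copies that are merged after the loop. A reduced, compilable sketch with one force and one virial accumulator (stand-in names and body):

static void force_virial_row(const double *dx, const double *w, int jnum,
                             double &fx, double &sv0)
{
  double fxtmp = 0.0, sv0tmp = 0.0;
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, sv0tmp)
#endif
  for (int jj = 0; jj < jnum; jj++) {
    const double f = w[jj] * dx[jj];  // stand-in for the pair force along x
    fxtmp += f;                       // force accumulator
    sv0tmp += dx[jj] * f;             // xx virial accumulator
  }
  fx += fxtmp;
  sv0 += sv0tmp;
}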

View File

@ -77,8 +77,9 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,
ev_init(eflag,vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
const int inum = list->inum;
const int nthreads = comm->nthreads;
@ -309,9 +310,14 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcebuck, evdwl, ecoul;

View File

@ -70,8 +70,9 @@ void PairBuckIntel::compute(int eflag, int vflag,
ev_init(eflag,vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
const int inum = list->inum;
const int nthreads = comm->nthreads;
@ -230,12 +231,18 @@ void PairBuckIntel::eval(const int offload, const int vflag,
fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
if (NEWTON_PAIR == 0)
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
if (vflag == VIRIAL_PAIR)
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < jnum; jj++) {

View File

@ -89,8 +89,9 @@ void PairDPDIntel::compute(int eflag, int vflag,
ev_init(eflag, vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
const int inum = list->inum;
const int nthreads = comm->nthreads;
@ -289,9 +290,14 @@ void PairDPDIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;

View File

@ -82,8 +82,9 @@ void PairEAMIntel::compute(int eflag, int vflag,
ev_init(eflag, vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
const int inum = list->inum;
const int nthreads = comm->nthreads;
@ -327,8 +328,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:rhoi)
#else
#pragma simd reduction(+:rhoi)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < ej; jj++) {
int jtype;
@ -369,23 +374,35 @@ void PairEAMIntel::eval(const int offload, const int vflag,
const int rcount = nall;
if (nthreads == 2) {
double *trho2 = rho + nmax;
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (int n = 0; n < rcount; n++)
rho[n] += trho2[n];
} else if (nthreads == 4) {
double *trho2 = rho + nmax;
double *trho3 = trho2 + nmax;
double *trho4 = trho3 + nmax;
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (int n = 0; n < rcount; n++)
rho[n] += trho2[n] + trho3[n] + trho4[n];
} else {
double *trhon = rho + nmax;
for (int t = 1; t < nthreads; t++) {
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (int n = 0; n < rcount; n++)
rho[n] += trhon[n];
trhon += nmax;
@ -414,8 +431,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
if (EFLAG) tevdwl = (acc_t)0.0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:tevdwl)
#else
#pragma simd reduction(+:tevdwl)
#endif
#pragma vector aligned
#endif
for (int ii = iifrom; ii < iito; ++ii) {
const int i = ilist[ii];
@ -510,9 +531,14 @@ void PairEAMIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < ej; jj++) {
int jtype;

View File

@ -76,8 +76,9 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
ev_init(eflag, vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
const int inum = list->inum;
const int nall = atom->nlocal + atom->nghost;
@ -449,9 +450,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
#endif
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \
t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#else
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \
t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#endif
for (int jj = 0; jj < packed_j; jj++) {
flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
@ -806,8 +812,12 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
acc_t *f_scalar2 = f_scalar + fst4;
for (int t = 1; t < nthreads; t++) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int n = iifrom * 8; n < sto; n++)
f_scalar[n] += f_scalar2[n];

View File

@ -73,8 +73,9 @@ void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag,
ev_init(eflag,vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
const int inum = list->inum;
const int nthreads = comm->nthreads;
@ -294,9 +295,14 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl;

View File

@ -77,8 +77,9 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
ev_init(eflag,vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
const int inum = list->inum;
const int nthreads = comm->nthreads;
@ -314,9 +315,14 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl, ecoul;

View File

@ -76,8 +76,9 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
ev_init(eflag,vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
const int inum = list->inum;
const int nthreads = comm->nthreads;
@ -305,9 +306,14 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl, ecoul;

View File

@ -68,8 +68,9 @@ void PairLJCutIntel::compute(int eflag, int vflag,
ev_init(eflag, vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
const int inum = list->inum;
const int nthreads = comm->nthreads;
@ -241,9 +242,15 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5) \
aligned(jlist,x,ljc12oi,special_lj,f,lj34i:64)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#pragma vector aligned
#endif
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;
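The lj/cut kernel additionally gains an OpenMP aligned() clause that asserts 64-byte alignment for the hot arrays, conveying on the portable path what #pragma vector aligned asserted on the Intel-only one. A stand-in sketch (the caller must genuinely supply 64-byte aligned arrays for the assertion to hold):

static void axpy_aligned(double *f, const double *x, int n)
{
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f, x : 64)
#endif
  for (int i = 0; i < n; i++) f[i] += 2.0 * x[i];
}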

View File

@ -97,8 +97,9 @@ void PairSWIntel::compute(int eflag, int vflag,
ev_init(eflag, vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
const int inum = list->inum;
const int nthreads = comm->nthreads;
@ -371,8 +372,12 @@ void PairSWIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < ejnum_pad; jj++) {
acc_t fjxtmp, fjytmp, fjztmp, fjtmp;

View File

@ -91,8 +91,9 @@ void PairTersoffIntel::compute(int eflag, int vflag,
ev_init(eflag,vflag);
if (vflag_atom)
error->all(FLERR,"INTEL package does not support per-atom stress");
if (vflag && !vflag_fdotr)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr");
if (vflag && !vflag_fdotr && force->newton_pair)
error->all(FLERR,"INTEL package does not support pair_modify nofdotr "
"with newton on");
const int inum = list->inum;
const int nthreads = comm->nthreads;

View File

@ -770,8 +770,12 @@ void PPPMDispIntel::particle_map(double delx, double dely, double delz,
IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:flag)
#else
#pragma simd reduction(+:flag)
#endif
#pragma vector aligned
#endif
for (int i = iifrom; i < iito; i++) {
@ -876,7 +880,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -885,7 +893,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3;
@ -917,8 +929,12 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
@ -939,7 +955,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@ -1025,7 +1045,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -1034,7 +1058,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3;
@ -1067,8 +1095,12 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
@ -1089,7 +1121,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr);
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@ -1173,7 +1209,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -1182,7 +1222,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3;
@ -1215,8 +1259,12 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m + nysum;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -1307,7 +1355,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -1316,7 +1368,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3;
@ -1349,8 +1405,12 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
@ -1373,7 +1433,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr);
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@ -1454,7 +1518,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho_lookup[idx][k];
@ -1463,7 +1531,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1 = rho_coeff[order-1][k];
@ -1498,8 +1570,12 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -1624,7 +1700,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -1636,7 +1716,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -1680,8 +1764,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -1702,7 +1790,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx[i] *= hx_inv;
@ -1802,7 +1894,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho6_lookup[idx][k];
@ -1811,7 +1907,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
@ -1846,8 +1946,12 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -1967,7 +2071,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -1979,7 +2087,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -2023,8 +2135,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -2045,7 +2161,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx[i] *= hx_inv;
@ -2143,7 +2263,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho6_lookup[idx][k];
@ -2152,7 +2276,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
@ -2206,8 +2334,12 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -2398,7 +2530,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -2410,7 +2546,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -2479,8 +2619,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -2541,7 +2685,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx0[i] *= hx_inv;
@ -2671,7 +2819,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho6_lookup[idx][k];
@ -2680,7 +2832,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
@ -2721,8 +2877,12 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -2848,7 +3008,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -2860,7 +3024,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -2909,8 +3077,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -2992,7 +3164,11 @@ void PPPMDispIntel::precompute_rho()
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
@ -3006,7 +3182,11 @@ void PPPMDispIntel::precompute_rho()
}
if (differentiation_flag == 1) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
@ -3026,7 +3206,11 @@ void PPPMDispIntel::precompute_rho()
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower_6; k<=nupper_6;k++) {
FFT_SCALAR r1 = ZEROF;
@ -3040,7 +3224,11 @@ void PPPMDispIntel::precompute_rho()
}
if (differentiation_flag == 1) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower_6; k<=nupper_6;k++) {
FFT_SCALAR r1 = ZEROF;
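Throughout these PPPM kernels the Intel-only trip-count hint, #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7), now follows the SIMD pragma rather than preceding it, keeping the portable pragma immediately in front of the short interpolation loop. A stand-in sketch with a hypothetical bound of 16 in place of the macro:

static void spread_stencil(double *rho, const double *coeff, int order)
{
  // order is the small PPPM interpolation order, hence the trip-count hint;
  // loop_count is Intel-specific and ignored by other compilers
#if defined(USE_OMP_SIMD)
#pragma omp simd
#endif
#pragma loop_count min(2), max(16), avg(7)
  for (int l = 0; l < order; l++) rho[l] += coeff[l];
}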

View File

@ -394,8 +394,12 @@ void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers)
IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:flag)
#else
#pragma simd reduction(+:flag)
#endif
#pragma vector aligned
#endif
for (int i = iifrom; i < iito; i++) {
@ -500,7 +504,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -509,7 +517,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3;
@ -541,7 +553,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mzyx = l + mzy;
@ -563,7 +579,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@ -645,7 +665,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho_lookup[idx][k];
@ -654,7 +678,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1 = rho_coeff[order-1][k];
@ -690,7 +718,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mx = l+nxsum;
@ -813,7 +845,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -825,7 +861,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -871,7 +911,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mx = l + nxsum;
@ -893,7 +937,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx[i] *= hx_inv;
@ -942,7 +990,11 @@ void PPPMIntel::precompute_rho()
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
@ -956,7 +1008,11 @@ void PPPMIntel::precompute_rho()
}
if (differentiation_flag == 1) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
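Worth noting next to these hunks is the thread reduction that follows the grid spreading in make_rho: each thread scatters charge into its own copy of the grid, and the copies are then summed into the shared one. A stand-in sketch assuming the per-thread copies are laid out contiguously after the first grid:

static void reduce_thread_grids(double *grid, int ngrid, int nthr)
{
  // grid[0..ngrid) is the shared copy; thread t's copy starts at t*ngrid
  for (int i = 0; i < ngrid; i++)
    for (int j = 1; j < nthr; j++)
      grid[i] += grid[j * ngrid + i];
}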

View File

@ -1,4 +1,3 @@
// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
@ -14,28 +13,26 @@
#include "compute_force_tally.h"
#include <cmath>
#include "atom.h"
#include "group.h"
#include "pair.h"
#include "update.h"
#include "memory.h"
#include "comm.h"
#include "error.h"
#include "force.h"
#include "comm.h"
#include "group.h"
#include "memory.h"
#include "pair.h"
#include "update.h"
#include <cmath>
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
ComputeForceTally::ComputeForceTally(LAMMPS *lmp, int narg, char **arg) :
Compute(lmp, narg, arg)
ComputeForceTally::ComputeForceTally(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg)
{
if (narg < 4) error->all(FLERR,"Illegal compute force/tally command");
if (narg < 4) error->all(FLERR, "Illegal compute force/tally command");
igroup2 = group->find(arg[3]);
if (igroup2 == -1)
error->all(FLERR,"Could not find compute force/tally second group ID");
if (igroup2 == -1) error->all(FLERR, "Could not find compute force/tally second group ID");
groupbit2 = group->bitmask[igroup2];
scalar_flag = 1;
@ -46,7 +43,7 @@ ComputeForceTally::ComputeForceTally(LAMMPS *lmp, int narg, char **arg) :
comm_reverse = size_peratom_cols = 3;
extscalar = 1;
peflag = 1; // we need Pair::ev_tally() to be run
did_setup = invoked_peratom = invoked_scalar = -1;
nmax = -1;
@ -68,17 +65,16 @@ ComputeForceTally::~ComputeForceTally()
void ComputeForceTally::init()
{
if (force->pair == nullptr)
error->all(FLERR,"Trying to use compute force/tally without pair style");
error->all(FLERR, "Trying to use compute force/tally without pair style");
else
force->pair->add_tally_callback(this);
if (comm->me == 0) {
if (force->pair->single_enable == 0 || force->pair->manybody_flag)
error->warning(FLERR,"Compute force/tally used with incompatible pair style");
error->warning(FLERR, "Compute force/tally used with incompatible pair style");
if (force->bond || force->angle || force->dihedral
|| force->improper || force->kspace)
error->warning(FLERR,"Compute force/tally only called from pair style");
if (force->bond || force->angle || force->dihedral || force->improper || force->kspace)
error->warning(FLERR, "Compute force/tally only called from pair style");
}
did_setup = -1;
}
@ -99,51 +95,48 @@ void ComputeForceTally::pair_setup_callback(int, int)
if (atom->nmax > nmax) {
memory->destroy(fatom);
nmax = atom->nmax;
memory->create(fatom,nmax,size_peratom_cols,"force/tally:fatom");
memory->create(fatom, nmax, size_peratom_cols, "force/tally:fatom");
array_atom = fatom;
}
// clear storage
for (int i=0; i < ntotal; ++i)
for (int j=0; j < size_peratom_cols; ++j)
fatom[i][j] = 0.0;
for (int i = 0; i < ntotal; ++i)
for (int j = 0; j < size_peratom_cols; ++j) fatom[i][j] = 0.0;
for (int i=0; i < size_peratom_cols; ++i)
vector[i] = ftotal[i] = 0.0;
for (int i = 0; i < size_peratom_cols; ++i) vector[i] = ftotal[i] = 0.0;
did_setup = update->ntimestep;
}
/* ---------------------------------------------------------------------- */
void ComputeForceTally::pair_tally_callback(int i, int j, int nlocal, int newton,
double, double, double fpair,
double dx, double dy, double dz)
void ComputeForceTally::pair_tally_callback(int i, int j, int nlocal, int newton, double, double,
double fpair, double dx, double dy, double dz)
{
const int * const mask = atom->mask;
const int *const mask = atom->mask;
if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
|| ((mask[i] & groupbit2) && (mask[j] & groupbit))) {
if (((mask[i] & groupbit) && (mask[j] & groupbit2)) ||
((mask[i] & groupbit2) && (mask[j] & groupbit))) {
if (newton || i < nlocal) {
if (mask[i] & groupbit) {
ftotal[0] += fpair*dx;
ftotal[1] += fpair*dy;
ftotal[2] += fpair*dz;
ftotal[0] += fpair * dx;
ftotal[1] += fpair * dy;
ftotal[2] += fpair * dz;
}
fatom[i][0] += fpair*dx;
fatom[i][1] += fpair*dy;
fatom[i][2] += fpair*dz;
fatom[i][0] += fpair * dx;
fatom[i][1] += fpair * dy;
fatom[i][2] += fpair * dz;
}
if (newton || j < nlocal) {
if (mask[j] & groupbit) {
ftotal[0] -= fpair*dx;
ftotal[1] -= fpair*dy;
ftotal[2] -= fpair*dz;
ftotal[0] -= fpair * dx;
ftotal[1] -= fpair * dy;
ftotal[2] -= fpair * dz;
}
fatom[j][0] -= fpair*dx;
fatom[j][1] -= fpair*dy;
fatom[j][2] -= fpair*dz;
fatom[j][0] -= fpair * dx;
fatom[j][1] -= fpair * dy;
fatom[j][2] -= fpair * dz;
}
}
}
@ -152,7 +145,7 @@ void ComputeForceTally::pair_tally_callback(int i, int j, int nlocal, int newton
int ComputeForceTally::pack_reverse_comm(int n, int first, double *buf)
{
int i,m,last;
int i, m, last;
m = 0;
last = first + n;
@ -168,7 +161,7 @@ int ComputeForceTally::pack_reverse_comm(int n, int first, double *buf)
void ComputeForceTally::unpack_reverse_comm(int n, int *list, double *buf)
{
int i,j,m;
int i, j, m;
m = 0;
for (i = 0; i < n; i++) {
@ -184,15 +177,14 @@ void ComputeForceTally::unpack_reverse_comm(int n, int *list, double *buf)
double ComputeForceTally::compute_scalar()
{
invoked_scalar = update->ntimestep;
if ((did_setup != invoked_scalar)
|| (update->eflag_global != invoked_scalar))
error->all(FLERR,"Energy was not tallied on needed timestep");
if ((did_setup != invoked_scalar) || (update->eflag_global != invoked_scalar))
error->all(FLERR, "Energy was not tallied on needed timestep");
// sum accumulated forces across procs
MPI_Allreduce(ftotal,vector,size_peratom_cols,MPI_DOUBLE,MPI_SUM,world);
MPI_Allreduce(ftotal, vector, size_peratom_cols, MPI_DOUBLE, MPI_SUM, world);
scalar = sqrt(vector[0]*vector[0]+vector[1]*vector[1]+vector[2]*vector[2]);
scalar = sqrt(vector[0] * vector[0] + vector[1] * vector[1] + vector[2] * vector[2]);
return scalar;
}
@ -201,9 +193,8 @@ double ComputeForceTally::compute_scalar()
void ComputeForceTally::compute_peratom()
{
invoked_peratom = update->ntimestep;
if ((did_setup != invoked_peratom)
|| (update->eflag_global != invoked_peratom))
error->all(FLERR,"Energy was not tallied on needed timestep");
if ((did_setup != invoked_peratom) || (update->eflag_global != invoked_peratom))
error->all(FLERR, "Energy was not tallied on needed timestep");
// collect contributions from ghost atoms
@ -213,8 +204,7 @@ void ComputeForceTally::compute_peratom()
// clear out ghost atom data after it has been collected to local atoms
const int nall = atom->nlocal + atom->nghost;
for (int i = atom->nlocal; i < nall; ++i)
for (int j = 0; j < size_peratom_cols; ++j)
fatom[i][j] = 0.0;
for (int j = 0; j < size_peratom_cols; ++j) fatom[i][j] = 0.0;
}
}
@ -224,7 +214,6 @@ void ComputeForceTally::compute_peratom()
double ComputeForceTally::memory_usage()
{
double bytes = (nmax < 0) ? 0 : nmax*size_peratom_cols * sizeof(double);
double bytes = (nmax < 0) ? 0 : nmax * (double)size_peratom_cols * sizeof(double);
return bytes;
}
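Besides the clang-format reflow, this and the following tally computes fix a latent integer overflow in memory_usage(): nmax*size_peratom_cols multiplies two ints before any widening, so casting one factor to double first keeps the byte count exact for very large per-process atom counts. A stand-in sketch of the corrected form:

static double tally_bytes(int nmax, int ncols)
{
  // promote to double before multiplying so the int*int product
  // cannot overflow when nmax * ncols * 8 exceeds INT_MAX
  return (nmax < 0) ? 0 : nmax * (double) ncols * sizeof(double);
}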

View File

@ -1,4 +1,3 @@
// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
@ -15,26 +14,25 @@
#include "compute_heat_flux_tally.h"
#include "atom.h"
#include "group.h"
#include "pair.h"
#include "update.h"
#include "memory.h"
#include "comm.h"
#include "error.h"
#include "force.h"
#include "comm.h"
#include "group.h"
#include "memory.h"
#include "pair.h"
#include "update.h"
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
ComputeHeatFluxTally::ComputeHeatFluxTally(LAMMPS *lmp, int narg, char **arg) :
Compute(lmp, narg, arg)
{
if (narg < 4) error->all(FLERR,"Illegal compute heat/flux/tally command");
if (narg < 4) error->all(FLERR, "Illegal compute heat/flux/tally command");
igroup2 = group->find(arg[3]);
if (igroup2 == -1)
error->all(FLERR,"Could not find compute heat/flux/tally second group ID");
if (igroup2 == -1) error->all(FLERR, "Could not find compute heat/flux/tally second group ID");
groupbit2 = group->bitmask[igroup2];
vector_flag = 1;
@ -44,7 +42,7 @@ ComputeHeatFluxTally::ComputeHeatFluxTally(LAMMPS *lmp, int narg, char **arg) :
comm_reverse = 7;
extvector = 1;
size_vector = 6;
peflag = 1; // we need Pair::ev_tally() to be run
did_setup = 0;
invoked_peratom = invoked_scalar = -1;
@ -71,17 +69,16 @@ ComputeHeatFluxTally::~ComputeHeatFluxTally()
void ComputeHeatFluxTally::init()
{
if (force->pair == nullptr)
error->all(FLERR,"Trying to use compute heat/flux/tally without pair style");
error->all(FLERR, "Trying to use compute heat/flux/tally without pair style");
else
force->pair->add_tally_callback(this);
if (comm->me == 0) {
if (force->pair->single_enable == 0 || force->pair->manybody_flag)
error->warning(FLERR,"Compute heat/flux/tally used with incompatible pair style");
error->warning(FLERR, "Compute heat/flux/tally used with incompatible pair style");
if (force->bond || force->angle || force->dihedral
|| force->improper || force->kspace)
error->warning(FLERR,"Compute heat/flux/tally only called from pair style");
if (force->bond || force->angle || force->dihedral || force->improper || force->kspace)
error->warning(FLERR, "Compute heat/flux/tally only called from pair style");
}
did_setup = -1;
}
@ -102,13 +99,13 @@ void ComputeHeatFluxTally::pair_setup_callback(int, int)
memory->destroy(stress);
memory->destroy(eatom);
nmax = atom->nmax;
memory->create(stress,nmax,6,"heat/flux/tally:stress");
memory->create(eatom,nmax,"heat/flux/tally:eatom");
memory->create(stress, nmax, 6, "heat/flux/tally:stress");
memory->create(eatom, nmax, "heat/flux/tally:eatom");
}
// clear storage
for (int i=0; i < ntotal; ++i) {
for (int i = 0; i < ntotal; ++i) {
eatom[i] = 0.0;
stress[i][0] = 0.0;
stress[i][1] = 0.0;
@ -118,30 +115,29 @@ void ComputeHeatFluxTally::pair_setup_callback(int, int)
stress[i][5] = 0.0;
}
for (int i=0; i < size_vector; ++i)
vector[i] = heatj[i] = 0.0;
for (int i = 0; i < size_vector; ++i) vector[i] = heatj[i] = 0.0;
did_setup = update->ntimestep;
}
/* ---------------------------------------------------------------------- */
void ComputeHeatFluxTally::pair_tally_callback(int i, int j, int nlocal, int newton,
double evdwl, double ecoul, double fpair,
double dx, double dy, double dz)
void ComputeHeatFluxTally::pair_tally_callback(int i, int j, int nlocal, int newton, double evdwl,
double ecoul, double fpair, double dx, double dy,
double dz)
{
const int * const mask = atom->mask;
const int *const mask = atom->mask;
if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
|| ((mask[i] & groupbit2) && (mask[j] & groupbit))) {
if (((mask[i] & groupbit) && (mask[j] & groupbit2)) ||
((mask[i] & groupbit2) && (mask[j] & groupbit))) {
const double epairhalf = 0.5 * (evdwl + ecoul);
fpair *= 0.5;
const double v0 = dx*dx*fpair; // dx*fpair = Fij_x
const double v1 = dy*dy*fpair;
const double v2 = dz*dz*fpair;
const double v3 = dx*dy*fpair;
const double v4 = dx*dz*fpair;
const double v5 = dy*dz*fpair;
const double v0 = dx * dx * fpair; // dx*fpair = Fij_x
const double v1 = dy * dy * fpair;
const double v2 = dz * dz * fpair;
const double v3 = dx * dy * fpair;
const double v4 = dx * dz * fpair;
const double v5 = dy * dz * fpair;
if (newton || i < nlocal) {
eatom[i] += epairhalf;
@ -168,7 +164,7 @@ void ComputeHeatFluxTally::pair_tally_callback(int i, int j, int nlocal, int new
int ComputeHeatFluxTally::pack_reverse_comm(int n, int first, double *buf)
{
int i,m,last;
int i, m, last;
m = 0;
last = first + n;
@ -188,7 +184,7 @@ int ComputeHeatFluxTally::pack_reverse_comm(int n, int first, double *buf)
void ComputeHeatFluxTally::unpack_reverse_comm(int n, int *list, double *buf)
{
int i,j,m;
int i, j, m;
m = 0;
for (i = 0; i < n; i++) {
@ -209,7 +205,7 @@ void ComputeHeatFluxTally::compute_vector()
{
invoked_vector = update->ntimestep;
if ((did_setup != invoked_vector) || (update->eflag_global != invoked_vector))
error->all(FLERR,"Energy was not tallied on needed timestep");
error->all(FLERR, "Energy was not tallied on needed timestep");
// collect contributions from ghost atoms
@ -244,26 +240,28 @@ void ComputeHeatFluxTally::compute_vector()
double *rmass = atom->rmass;
int *type = atom->type;
double jc[3] = {0.0,0.0,0.0};
double jv[3] = {0.0,0.0,0.0};
double jc[3] = {0.0, 0.0, 0.0};
double jv[3] = {0.0, 0.0, 0.0};
for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) {
const double * const vi = v[i];
const double * const si = stress[i];
const double *const vi = v[i];
const double *const si = stress[i];
double ke_i;
if (rmass) ke_i = pfactor * rmass[i];
else ke_i = pfactor * mass[type[i]];
ke_i *= (vi[0]*vi[0] + vi[1]*vi[1] + vi[2]*vi[2]);
if (rmass)
ke_i = pfactor * rmass[i];
else
ke_i = pfactor * mass[type[i]];
ke_i *= (vi[0] * vi[0] + vi[1] * vi[1] + vi[2] * vi[2]);
ke_i += eatom[i];
jc[0] += ke_i*vi[0];
jc[1] += ke_i*vi[1];
jc[2] += ke_i*vi[2];
jv[0] += si[0]*vi[0] + si[3]*vi[1] + si[4]*vi[2];
jv[1] += si[3]*vi[0] + si[1]*vi[1] + si[5]*vi[2];
jv[2] += si[4]*vi[0] + si[5]*vi[1] + si[2]*vi[2];
jc[0] += ke_i * vi[0];
jc[1] += ke_i * vi[1];
jc[2] += ke_i * vi[2];
jv[0] += si[0] * vi[0] + si[3] * vi[1] + si[4] * vi[2];
jv[1] += si[3] * vi[0] + si[1] * vi[1] + si[5] * vi[2];
jv[2] += si[4] * vi[0] + si[5] * vi[1] + si[2] * vi[2];
}
}
@ -274,7 +272,7 @@ void ComputeHeatFluxTally::compute_vector()
heatj[3] = jc[0];
heatj[4] = jc[1];
heatj[5] = jc[2];
MPI_Allreduce(heatj,vector,size_vector,MPI_DOUBLE,MPI_SUM,world);
MPI_Allreduce(heatj, vector, size_vector, MPI_DOUBLE, MPI_SUM, world);
}
/* ----------------------------------------------------------------------
@ -283,7 +281,6 @@ void ComputeHeatFluxTally::compute_vector()
double ComputeHeatFluxTally::memory_usage()
{
double bytes = (nmax < 0) ? 0 : nmax*comm_reverse * sizeof(double);
double bytes = (nmax < 0) ? 0 : nmax * (double)comm_reverse * sizeof(double);
return bytes;
}
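The reflowed loop above builds the virial part of the heat flux by contracting each atom's stress, stored in Voigt order (xx, yy, zz, xy, xz, yz), with its velocity, while the convective part weights the velocity by kinetic plus tallied potential energy. The contraction written out as a helper, with the same index pattern as the jv updates above:

static void stress_times_v(const double s[6], const double v[3], double jv[3])
{
  jv[0] += s[0] * v[0] + s[3] * v[1] + s[4] * v[2];  // (xx xy xz) . v
  jv[1] += s[3] * v[0] + s[1] * v[1] + s[5] * v[2];  // (xy yy yz) . v
  jv[2] += s[4] * v[0] + s[5] * v[1] + s[2] * v[2];  // (xz yz zz) . v
}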

View File

@ -233,6 +233,6 @@ void ComputeHeatFluxVirialTally::compute_peratom()
double ComputeHeatFluxVirialTally::memory_usage()
{
double bytes = (nmax < 0) ? 0 : nmax * size_peratom_cols * sizeof(double);
double bytes = (nmax < 0) ? 0 : nmax * (double)size_peratom_cols * sizeof(double);
return bytes;
}

View File

@ -1,4 +1,3 @@
// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
@ -15,25 +14,23 @@
#include "compute_pe_mol_tally.h"
#include "atom.h"
#include "comm.h"
#include "error.h"
#include "force.h"
#include "group.h"
#include "pair.h"
#include "update.h"
#include "error.h"
#include "force.h"
#include "comm.h"
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
ComputePEMolTally::ComputePEMolTally(LAMMPS *lmp, int narg, char **arg) :
Compute(lmp, narg, arg)
ComputePEMolTally::ComputePEMolTally(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg)
{
if (narg < 4) error->all(FLERR,"Illegal compute pe/mol/tally command");
if (narg < 4) error->all(FLERR, "Illegal compute pe/mol/tally command");
igroup2 = group->find(arg[3]);
if (igroup2 == -1)
error->all(FLERR,"Could not find compute pe/mol/tally second group ID");
if (igroup2 == -1) error->all(FLERR, "Could not find compute pe/mol/tally second group ID");
groupbit2 = group->bitmask[igroup2];
vector_flag = 1;
@ -42,7 +39,7 @@ ComputePEMolTally::ComputePEMolTally(LAMMPS *lmp, int narg, char **arg) :
dynamic_group_allow = 0;
extvector = 1;
peflag = 1; // we need Pair::ev_tally() to be run
did_setup = invoked_vector = -1;
vector = new double[size_vector];
@ -61,20 +58,18 @@ ComputePEMolTally::~ComputePEMolTally()
void ComputePEMolTally::init()
{
if (force->pair == nullptr)
error->all(FLERR,"Trying to use compute pe/mol/tally without pair style");
error->all(FLERR, "Trying to use compute pe/mol/tally without pair style");
else
force->pair->add_tally_callback(this);
if (atom->molecule_flag == 0)
error->all(FLERR,"Compute pe/mol/tally requires molecule IDs");
if (atom->molecule_flag == 0) error->all(FLERR, "Compute pe/mol/tally requires molecule IDs");
if (comm->me == 0) {
if (force->pair->single_enable == 0 || force->pair->manybody_flag)
error->warning(FLERR,"Compute pe/mol/tally used with incompatible pair style");
error->warning(FLERR, "Compute pe/mol/tally used with incompatible pair style");
if (force->bond || force->angle || force->dihedral
|| force->improper || force->kspace)
error->warning(FLERR,"Compute pe/mol/tally only called from pair style");
if (force->bond || force->angle || force->dihedral || force->improper || force->kspace)
error->warning(FLERR, "Compute pe/mol/tally only called from pair style");
}
did_setup = -1;
}
@ -93,29 +88,33 @@ void ComputePEMolTally::pair_setup_callback(int, int)
}
/* ---------------------------------------------------------------------- */
void ComputePEMolTally::pair_tally_callback(int i, int j, int nlocal, int newton,
double evdwl, double ecoul, double,
double, double, double)
void ComputePEMolTally::pair_tally_callback(int i, int j, int nlocal, int newton, double evdwl,
double ecoul, double, double, double, double)
{
const int * const mask = atom->mask;
const tagint * const molid = atom->molecule;
const int *const mask = atom->mask;
const tagint *const molid = atom->molecule;
if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
|| ((mask[i] & groupbit2) && (mask[j] & groupbit))) {
if (((mask[i] & groupbit) && (mask[j] & groupbit2)) ||
((mask[i] & groupbit2) && (mask[j] & groupbit))) {
evdwl *= 0.5; ecoul *= 0.5;
evdwl *= 0.5;
ecoul *= 0.5;
if (newton || i < nlocal) {
if (molid[i] == molid[j]) {
etotal[0] += evdwl; etotal[1] += ecoul;
etotal[0] += evdwl;
etotal[1] += ecoul;
} else {
etotal[2] += evdwl; etotal[3] += ecoul;
etotal[2] += evdwl;
etotal[3] += ecoul;
}
}
if (newton || j < nlocal) {
if (molid[i] == molid[j]) {
etotal[0] += evdwl; etotal[1] += ecoul;
etotal[0] += evdwl;
etotal[1] += ecoul;
} else {
etotal[2] += evdwl; etotal[3] += ecoul;
etotal[2] += evdwl;
etotal[3] += ecoul;
}
}
}
@ -127,10 +126,9 @@ void ComputePEMolTally::compute_vector()
{
invoked_vector = update->ntimestep;
if ((did_setup != invoked_vector) || (update->eflag_global != invoked_vector))
error->all(FLERR,"Energy was not tallied on needed timestep");
error->all(FLERR, "Energy was not tallied on needed timestep");
// sum accumulated energies across procs
MPI_Allreduce(etotal,vector,size_vector,MPI_DOUBLE,MPI_SUM,world);
MPI_Allreduce(etotal, vector, size_vector, MPI_DOUBLE, MPI_SUM, world);
}
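The tally callbacks above all follow one convention: the pair energy is halved once and then credited separately to each atom under the guard newton || index < nlocal, so the total is counted exactly once whether ghost contributions are reverse-communicated (newton on) or skipped (newton off). A reduced stand-in sketch:

static void tally_pair_energy(double epair, int i, int j, int nlocal,
                              int newton, double *eatom)
{
  const double half = 0.5 * epair;
  if (newton || i < nlocal) eatom[i] += half;  // owned atom, or ghost with newton on
  if (newton || j < nlocal) eatom[j] += half;  // ghost halves are reverse-communicated later
}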

View File

@ -1,4 +1,3 @@
// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
@ -15,26 +14,24 @@
#include "compute_pe_tally.h"
#include "atom.h"
#include "group.h"
#include "pair.h"
#include "update.h"
#include "memory.h"
#include "comm.h"
#include "error.h"
#include "force.h"
#include "comm.h"
#include "group.h"
#include "memory.h"
#include "pair.h"
#include "update.h"
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
ComputePETally::ComputePETally(LAMMPS *lmp, int narg, char **arg) :
Compute(lmp, narg, arg)
ComputePETally::ComputePETally(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg)
{
if (narg < 4) error->all(FLERR,"Illegal compute pe/tally command");
if (narg < 4) error->all(FLERR, "Illegal compute pe/tally command");
igroup2 = group->find(arg[3]);
if (igroup2 == -1)
error->all(FLERR,"Could not find compute pe/tally second group ID");
if (igroup2 == -1) error->all(FLERR, "Could not find compute pe/tally second group ID");
groupbit2 = group->bitmask[igroup2];
scalar_flag = 1;
@ -45,7 +42,7 @@ ComputePETally::ComputePETally(LAMMPS *lmp, int narg, char **arg) :
comm_reverse = size_peratom_cols = 2;
extscalar = 1;
peflag = 1; // we need Pair::ev_tally() to be run
did_setup = invoked_peratom = invoked_scalar = -1;
nmax = -1;
@ -67,17 +64,16 @@ ComputePETally::~ComputePETally()
void ComputePETally::init()
{
if (force->pair == nullptr)
error->all(FLERR,"Trying to use compute pe/tally without a pair style");
error->all(FLERR, "Trying to use compute pe/tally without a pair style");
else
force->pair->add_tally_callback(this);
if (comm->me == 0) {
if (force->pair->single_enable == 0 || force->pair->manybody_flag)
error->warning(FLERR,"Compute pe/tally used with incompatible pair style");
error->warning(FLERR, "Compute pe/tally used with incompatible pair style");
if (force->bond || force->angle || force->dihedral
|| force->improper || force->kspace)
error->warning(FLERR,"Compute pe/tally only called from pair style");
if (force->bond || force->angle || force->dihedral || force->improper || force->kspace)
error->warning(FLERR, "Compute pe/tally only called from pair style");
}
did_setup = -1;
}
@ -98,14 +94,13 @@ void ComputePETally::pair_setup_callback(int, int)
if (atom->nmax > nmax) {
memory->destroy(eatom);
nmax = atom->nmax;
memory->create(eatom,nmax,size_peratom_cols,"pe/tally:eatom");
memory->create(eatom, nmax, size_peratom_cols, "pe/tally:eatom");
array_atom = eatom;
}
// clear storage
for (int i=0; i < ntotal; ++i)
eatom[i][0] = eatom[i][1] = 0.0;
for (int i = 0; i < ntotal; ++i) eatom[i][0] = eatom[i][1] = 0.0;
vector[0] = etotal[0] = vector[1] = etotal[1] = 0.0;
@ -113,23 +108,27 @@ void ComputePETally::pair_setup_callback(int, int)
}
/* ---------------------------------------------------------------------- */
void ComputePETally::pair_tally_callback(int i, int j, int nlocal, int newton,
double evdwl, double ecoul, double,
double, double, double)
void ComputePETally::pair_tally_callback(int i, int j, int nlocal, int newton, double evdwl,
double ecoul, double, double, double, double)
{
const int * const mask = atom->mask;
const int *const mask = atom->mask;
if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
|| ((mask[i] & groupbit2) && (mask[j] & groupbit))) {
if (((mask[i] & groupbit) && (mask[j] & groupbit2)) ||
((mask[i] & groupbit2) && (mask[j] & groupbit))) {
evdwl *= 0.5; ecoul *= 0.5;
evdwl *= 0.5;
ecoul *= 0.5;
if (newton || i < nlocal) {
etotal[0] += evdwl; eatom[i][0] += evdwl;
etotal[1] += ecoul; eatom[i][1] += ecoul;
etotal[0] += evdwl;
eatom[i][0] += evdwl;
etotal[1] += ecoul;
eatom[i][1] += ecoul;
}
if (newton || j < nlocal) {
etotal[0] += evdwl; eatom[j][0] += evdwl;
etotal[1] += ecoul; eatom[j][1] += ecoul;
etotal[0] += evdwl;
eatom[j][0] += evdwl;
etotal[1] += ecoul;
eatom[j][1] += ecoul;
}
}
}
@ -138,7 +137,7 @@ void ComputePETally::pair_tally_callback(int i, int j, int nlocal, int newton,
int ComputePETally::pack_reverse_comm(int n, int first, double *buf)
{
int i,m,last;
int i, m, last;
m = 0;
last = first + n;
@ -153,7 +152,7 @@ int ComputePETally::pack_reverse_comm(int n, int first, double *buf)
void ComputePETally::unpack_reverse_comm(int n, int *list, double *buf)
{
int i,j,m;
int i, j, m;
m = 0;
for (i = 0; i < n; i++) {
@ -168,15 +167,14 @@ void ComputePETally::unpack_reverse_comm(int n, int *list, double *buf)
double ComputePETally::compute_scalar()
{
invoked_scalar = update->ntimestep;
if ((did_setup != invoked_scalar)
|| (update->eflag_global != invoked_scalar))
error->all(FLERR,"Energy was not tallied on needed timestep");
if ((did_setup != invoked_scalar) || (update->eflag_global != invoked_scalar))
error->all(FLERR, "Energy was not tallied on needed timestep");
// sum accumulated energies across procs
MPI_Allreduce(etotal,vector,size_peratom_cols,MPI_DOUBLE,MPI_SUM,world);
MPI_Allreduce(etotal, vector, size_peratom_cols, MPI_DOUBLE, MPI_SUM, world);
scalar = vector[0]+vector[1];
scalar = vector[0] + vector[1];
return scalar;
}
@ -185,9 +183,8 @@ double ComputePETally::compute_scalar()
void ComputePETally::compute_peratom()
{
invoked_peratom = update->ntimestep;
if ((did_setup != invoked_peratom)
|| (update->eflag_global != invoked_peratom))
error->all(FLERR,"Energy was not tallied on needed timestep");
if ((did_setup != invoked_peratom) || (update->eflag_global != invoked_peratom))
error->all(FLERR, "Energy was not tallied on needed timestep");
// collect contributions from ghost atoms
@ -196,8 +193,7 @@ void ComputePETally::compute_peratom()
// clear out ghost atom data after it has been collected to local atoms
const int nall = atom->nlocal + atom->nghost;
for (int i = atom->nlocal; i < nall; ++i)
eatom[i][0] = eatom[i][1] = 0.0;
for (int i = atom->nlocal; i < nall; ++i) eatom[i][0] = eatom[i][1] = 0.0;
}
}
@ -207,7 +203,6 @@ void ComputePETally::compute_peratom()
double ComputePETally::memory_usage()
{
double bytes = (nmax < 0) ? 0 : nmax*size_peratom_cols * sizeof(double);
double bytes = (nmax < 0) ? 0 : nmax * (double)size_peratom_cols * sizeof(double);
return bytes;
}
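
The pair_tally_callback() above encodes the double-counting rule for pairwise energies: Pair::ev_tally() reports the full pair energy once per pair, the callback halves it, and each half is credited to one atom of the pair, but only when that atom is local, or when newton_pair is on and the ghost contribution will later be folded back by the reverse communication in pack/unpack_reverse_comm(). A self-contained sketch of just that rule, simplified to a scalar per-atom energy (hypothetical helper, not the LAMMPS API):

#include <cstdio>

// half of the pair energy goes to each atom; ghost-atom halves are
// dropped unless newton_pair is on (then reverse comm collects them)
void tally_pair(double evdwl, int i, int j, int nlocal, bool newton,
                double *eatom, double &etotal)
{
  evdwl *= 0.5;
  if (newton || i < nlocal) { etotal += evdwl; eatom[i] += evdwl; }
  if (newton || j < nlocal) { etotal += evdwl; eatom[j] += evdwl; }
}

int main()
{
  double eatom[4] = {0, 0, 0, 0};
  double etotal = 0.0;
  // one local-local pair (0,1) and one local-ghost pair (1,3), nlocal = 3
  tally_pair(-1.0, 0, 1, 3, false, eatom, etotal);
  tally_pair(-0.5, 1, 3, 3, false, eatom, etotal);   // ghost half dropped
  printf("etotal = %g\n", etotal);                   // prints -1.25
  return 0;
}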

View File

@ -1,4 +1,3 @@
// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
@ -15,27 +14,25 @@
#include "compute_stress_tally.h"
#include "atom.h"
#include "group.h"
#include "pair.h"
#include "update.h"
#include "memory.h"
#include "error.h"
#include "force.h"
#include "comm.h"
#include "domain.h"
#include "error.h"
#include "force.h"
#include "group.h"
#include "memory.h"
#include "pair.h"
#include "update.h"
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
ComputeStressTally::ComputeStressTally(LAMMPS *lmp, int narg, char **arg) :
Compute(lmp, narg, arg)
ComputeStressTally::ComputeStressTally(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg)
{
if (narg < 4) error->all(FLERR,"Illegal compute stress/tally command");
if (narg < 4) error->all(FLERR, "Illegal compute stress/tally command");
igroup2 = group->find(arg[3]);
if (igroup2 == -1)
error->all(FLERR,"Could not find compute stress/tally second group ID");
if (igroup2 == -1) error->all(FLERR, "Could not find compute stress/tally second group ID");
groupbit2 = group->bitmask[igroup2];
scalar_flag = 1;
@ -46,7 +43,7 @@ ComputeStressTally::ComputeStressTally(LAMMPS *lmp, int narg, char **arg) :
comm_reverse = size_peratom_cols = 6;
extscalar = 0;
peflag = 1; // we need Pair::ev_tally() to be run
peflag = 1; // we need Pair::ev_tally() to be run
did_setup = invoked_peratom = invoked_scalar = -1;
nmax = -1;
@ -70,17 +67,16 @@ ComputeStressTally::~ComputeStressTally()
void ComputeStressTally::init()
{
if (force->pair == nullptr)
error->all(FLERR,"Trying to use compute stress/tally without pair style");
error->all(FLERR, "Trying to use compute stress/tally without pair style");
else
force->pair->add_tally_callback(this);
if (comm->me == 0) {
if (force->pair->single_enable == 0 || force->pair->manybody_flag)
error->warning(FLERR,"Compute stress/tally used with incompatible pair style");
error->warning(FLERR, "Compute stress/tally used with incompatible pair style");
if (force->bond || force->angle || force->dihedral
|| force->improper || force->kspace)
error->warning(FLERR,"Compute stress/tally only called from pair style");
if (force->bond || force->angle || force->dihedral || force->improper || force->kspace)
error->warning(FLERR, "Compute stress/tally only called from pair style");
}
did_setup = -1;
}
@ -101,55 +97,64 @@ void ComputeStressTally::pair_setup_callback(int, int)
if (atom->nmax > nmax) {
memory->destroy(stress);
nmax = atom->nmax;
memory->create(stress,nmax,size_peratom_cols,"stress/tally:stress");
memory->create(stress, nmax, size_peratom_cols, "stress/tally:stress");
array_atom = stress;
}
// clear storage
for (int i=0; i < ntotal; ++i)
for (int j=0; j < size_peratom_cols; ++j)
stress[i][j] = 0.0;
for (int i = 0; i < ntotal; ++i)
for (int j = 0; j < size_peratom_cols; ++j) stress[i][j] = 0.0;
for (int i=0; i < size_peratom_cols; ++i)
vector[i] = virial[i] = 0.0;
for (int i = 0; i < size_peratom_cols; ++i) vector[i] = virial[i] = 0.0;
did_setup = update->ntimestep;
}
/* ---------------------------------------------------------------------- */
void ComputeStressTally::pair_tally_callback(int i, int j, int nlocal, int newton,
double, double, double fpair,
double dx, double dy, double dz)
void ComputeStressTally::pair_tally_callback(int i, int j, int nlocal, int newton, double, double,
double fpair, double dx, double dy, double dz)
{
const int * const mask = atom->mask;
const int *const mask = atom->mask;
if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
|| ((mask[i] & groupbit2) && (mask[j] & groupbit))) {
if (((mask[i] & groupbit) && (mask[j] & groupbit2)) ||
((mask[i] & groupbit2) && (mask[j] & groupbit))) {
fpair *= 0.5;
const double v0 = dx*dx*fpair;
const double v1 = dy*dy*fpair;
const double v2 = dz*dz*fpair;
const double v3 = dx*dy*fpair;
const double v4 = dx*dz*fpair;
const double v5 = dy*dz*fpair;
const double v0 = dx * dx * fpair;
const double v1 = dy * dy * fpair;
const double v2 = dz * dz * fpair;
const double v3 = dx * dy * fpair;
const double v4 = dx * dz * fpair;
const double v5 = dy * dz * fpair;
if (newton || i < nlocal) {
virial[0] += v0; stress[i][0] += v0;
virial[1] += v1; stress[i][1] += v1;
virial[2] += v2; stress[i][2] += v2;
virial[3] += v3; stress[i][3] += v3;
virial[4] += v4; stress[i][4] += v4;
virial[5] += v5; stress[i][5] += v5;
virial[0] += v0;
stress[i][0] += v0;
virial[1] += v1;
stress[i][1] += v1;
virial[2] += v2;
stress[i][2] += v2;
virial[3] += v3;
stress[i][3] += v3;
virial[4] += v4;
stress[i][4] += v4;
virial[5] += v5;
stress[i][5] += v5;
}
if (newton || j < nlocal) {
virial[0] += v0; stress[j][0] += v0;
virial[1] += v1; stress[j][1] += v1;
virial[2] += v2; stress[j][2] += v2;
virial[3] += v3; stress[j][3] += v3;
virial[4] += v4; stress[j][4] += v4;
virial[5] += v5; stress[j][5] += v5;
virial[0] += v0;
stress[j][0] += v0;
virial[1] += v1;
stress[j][1] += v1;
virial[2] += v2;
stress[j][2] += v2;
virial[3] += v3;
stress[j][3] += v3;
virial[4] += v4;
stress[j][4] += v4;
virial[5] += v5;
stress[j][5] += v5;
}
}
}
@ -158,7 +163,7 @@ void ComputeStressTally::pair_tally_callback(int i, int j, int nlocal, int newto
int ComputeStressTally::pack_reverse_comm(int n, int first, double *buf)
{
int i,m,last;
int i, m, last;
m = 0;
last = first + n;
@ -177,7 +182,7 @@ int ComputeStressTally::pack_reverse_comm(int n, int first, double *buf)
void ComputeStressTally::unpack_reverse_comm(int n, int *list, double *buf)
{
int i,j,m;
int i, j, m;
m = 0;
for (i = 0; i < n; i++) {
@ -196,18 +201,17 @@ void ComputeStressTally::unpack_reverse_comm(int n, int *list, double *buf)
double ComputeStressTally::compute_scalar()
{
invoked_scalar = update->ntimestep;
if ((did_setup != invoked_scalar)
|| (update->eflag_global != invoked_scalar))
error->all(FLERR,"Energy was not tallied on needed timestep");
if ((did_setup != invoked_scalar) || (update->eflag_global != invoked_scalar))
error->all(FLERR, "Energy was not tallied on needed timestep");
// sum accumulated virial contributions across procs
MPI_Allreduce(virial,vector,size_peratom_cols,MPI_DOUBLE,MPI_SUM,world);
MPI_Allreduce(virial, vector, size_peratom_cols, MPI_DOUBLE, MPI_SUM, world);
if (domain->dimension == 3)
scalar = (vector[0]+vector[1]+vector[2])/3.0;
scalar = (vector[0] + vector[1] + vector[2]) / 3.0;
else
scalar = (vector[0]+vector[1])/2.0;
scalar = (vector[0] + vector[1]) / 2.0;
return scalar;
}
@ -217,9 +221,8 @@ double ComputeStressTally::compute_scalar()
void ComputeStressTally::compute_peratom()
{
invoked_peratom = update->ntimestep;
if ((did_setup != invoked_peratom)
|| (update->eflag_global != invoked_peratom))
error->all(FLERR,"Energy was not tallied on needed timestep");
if ((did_setup != invoked_peratom) || (update->eflag_global != invoked_peratom))
error->all(FLERR, "Energy was not tallied on needed timestep");
// collect contributions from ghost atoms
@ -228,8 +231,7 @@ void ComputeStressTally::compute_peratom()
const int nall = atom->nlocal + atom->nghost;
for (int i = atom->nlocal; i < nall; ++i)
for (int j = 0; j < size_peratom_cols; ++j)
stress[i][j] = 0.0;
for (int j = 0; j < size_peratom_cols; ++j) stress[i][j] = 0.0;
}
// convert to stress*volume units = -pressure*volume
@ -251,7 +253,6 @@ void ComputeStressTally::compute_peratom()
double ComputeStressTally::memory_usage()
{
double bytes = (nmax < 0) ? 0 : nmax*size_peratom_cols * sizeof(double);
double bytes = (nmax < 0) ? 0 : nmax * (double)size_peratom_cols * sizeof(double);
return bytes;
}
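
The v0..v5 terms above are the six independent components of the symmetric per-pair virial tensor, the outer product of the separation vector with the pair force f = fpair * (dx, dy, dz), and compute_scalar() reduces them to a pressure-like scalar by averaging the diagonal over the dimension. A standalone sketch of that arithmetic (illustrative helper, not LAMMPS code):

#include <cstdio>

// six independent components of the symmetric virial tensor for a
// central pair force, matching the v0..v5 terms tallied above
void pair_virial(double fpair, double dx, double dy, double dz, double v[6])
{
  v[0] = dx * dx * fpair;   // xx
  v[1] = dy * dy * fpair;   // yy
  v[2] = dz * dz * fpair;   // zz
  v[3] = dx * dy * fpair;   // xy
  v[4] = dx * dz * fpair;   // xz
  v[5] = dy * dz * fpair;   // yz
}

int main()
{
  double v[6];
  pair_virial(2.0, 1.0, 0.5, 0.0, v);
  // the 3d scalar is the mean of the diagonal components
  double scalar3d = (v[0] + v[1] + v[2]) / 3.0;
  printf("scalar = %g\n", scalar3d);   // (2.0 + 0.5 + 0.0) / 3
  return 0;
}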

View File

@ -50,7 +50,7 @@ CUB_URL="https://github.com/NVlabs/cub/archive/1.12.0.tar.gz"
KOKKOS_URL="https://github.com/kokkos/kokkos/archive/3.4.01.tar.gz"
KIM_URL="https://s3.openkim.org/kim-api/kim-api-2.2.1.txz"
MSCG_URL="https://github.com/uchicago-voth/MSCG-release/archive/1.7.3.1.tar.gz"
PLUMED_URL="https://github.com/plumed/plumed2/releases/download/v2.7.1/plumed-src-2.7.1.tgz"
PLUMED_URL="https://github.com/plumed/plumed2/releases/download/v2.7.2/plumed-src-2.7.2.tgz"
PACELIB_URL="https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2021.4.9.tar.gz"
LATTE_URL="https://github.com/lanl/LATTE/archive/v1.2.2.tar.gz"
SCAFACOS_URL="https://github.com/scafacos/scafacos/releases/download/v1.0.1/scafacos-1.0.1.tar.gz"

View File

@ -1,7 +1,7 @@
---
lammps_version: 2 Jul 2021
date_generated: Wed Jul 21 15:49:45 2021
epsilon: 1e-11
epsilon: 2e-11
prerequisites: ! |
pair reaxff
fix qeq/reaxff

View File

@ -1,7 +1,7 @@
---
lammps_version: 2 Jul 2021
date_generated: Wed Jul 21 15:49:47 2021
epsilon: 1e-12
epsilon: 3e-12
prerequisites: ! |
pair reaxff
fix qeq/reaxff
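
Both reference files above only relax the epsilon tolerance used when comparing freshly computed results against the stored reference data; nothing about the tests themselves changes. As a hedged illustration of what such a relative tolerance typically means (the actual comparison in the LAMMPS test harness may differ in detail):

#include <cmath>
#include <cstdio>

// values match when their difference is small relative to their magnitude
bool approx_equal(double a, double b, double eps)
{
  return std::fabs(a - b) <= eps * std::fmax(std::fabs(a), std::fabs(b));
}

int main()
{
  // with eps = 1e-12 this pair fails; the relaxed 3e-12 accepts it
  double ref = 1.0, val = 1.0 + 2.5e-12;
  printf("eps=1e-12: %d, eps=3e-12: %d\n",
         approx_equal(val, ref, 1e-12), approx_equal(val, ref, 3e-12));
  return 0;
}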

View File

@ -281,7 +281,6 @@ TEST_F(FileOperationsTest, error_message_warn)
TEST_F(FileOperationsTest, error_all_one)
{
char buf[64];
BEGIN_HIDE_OUTPUT();
command("echo none");
command("log none");

View File

@ -94,7 +94,8 @@ TEST(Tokenizer, copy_constructor)
TEST(Tokenizer, move_constructor)
{
Tokenizer u = std::move(Tokenizer("test new word ", " "));
Tokenizer t("test new word ", " ");
Tokenizer u = std::move(t);
ASSERT_THAT(u.next(), Eq("test"));
ASSERT_THAT(u.next(), Eq("new"));
ASSERT_THAT(u.next(), Eq("word"));
@ -248,7 +249,8 @@ TEST(ValueTokenizer, copy_constructor)
TEST(ValueTokenizer, move_constructor)
{
ValueTokenizer u = std::move(ValueTokenizer(" test new word ", " "));
ValueTokenizer t(" test new word ", " ");
ValueTokenizer u = std::move(t);
ASSERT_THAT(u.next_string(), Eq("test"));
ASSERT_THAT(u.next_string(), Eq("new"));
ASSERT_THAT(u.next_string(), Eq("word"));
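
The two test fixes above address the same issue: applying std::move to a temporary (a prvalue) is at best a no-op and at worst pessimizing, since it blocks copy elision, and compilers flag it with warnings such as -Wpessimizing-move; it also never move-constructs from a distinct, named object. Creating a named tokenizer first and then moving from it makes the test actually exercise the move constructor. A minimal sketch of the idiom with a stand-in type (illustrative, not the Tokenizer API):

#include <string>
#include <utility>

struct Toy {
  std::string data;
  explicit Toy(std::string s) : data(std::move(s)) {}
};

int main()
{
  // before: Toy u = std::move(Toy("payload"));  // defeats copy elision
  Toy t("payload");        // named source object
  Toy u = std::move(t);    // genuinely invokes the move constructor
  return u.data.empty();   // 'u' now owns the data; 't' is moved-from
}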