Converting Cilk vectorization directives to the OpenMP standard and changing some more deprecated vector intrinsics. Data alignment directives for compiler vectorization are still mostly Intel-specific.

Mike Brown
2021-07-26 11:22:21 -07:00
parent 1345c25f41
commit fefcd0e2af
33 changed files with 1013 additions and 189 deletions
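The same two mechanical transformations repeat across the 33 files. First, each legacy Cilk-style "#pragma simd" or "#pragma simdoff" directive gains a preprocessor dispatch: compilers that define USE_OMP_SIMD (set in the preprocessor header below for __INTEL_LLVM_COMPILER, the LLVM-based oneAPI compiler) get the standard OpenMP directive, while the classic Intel compiler keeps the old spelling. A minimal sketch of the pattern, assuming placeholder names that are not code from this commit:

// Hedged sketch of the directive dispatch used throughout this commit.
// The function, array names, and accumulator are illustrative placeholders.
double reduce_sketch(const double *f, const double *x, int nfrom, int nto) {
  double esum = 0.0;
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:esum)   // OpenMP 4.0+ standard directive
#else
#pragma simd reduction(+:esum)       // legacy Cilk-style directive (classic icc)
#endif
  for (int n = nfrom; n < nto; n++)
    esum += f[n] * x[n];
  return esum;
}

"#pragma simdoff", which keeps a scalar force-update block out of vectorization, maps the same way onto the standard "#pragma omp ordered simd". The second transformation, replacing the KNC-era gather/scatter intrinsics, is sketched after the first intrinsics hunk below.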

View File

@@ -162,7 +162,11 @@ void AngleCharmmIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -246,7 +250,11 @@ void AngleCharmmIntel::eval(const int vflag,
// apply force to each of 3 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -162,7 +162,11 @@ void AngleHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -228,7 +232,11 @@ void AngleHarmonicIntel::eval(const int vflag,
// apply force to each of 3 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -158,7 +158,11 @@ void BondFENEIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -215,7 +219,11 @@ void BondFENEIntel::eval(const int vflag,
// apply force to each of 2 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -155,7 +155,11 @@ void BondHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -184,7 +188,11 @@ void BondHarmonicIntel::eval(const int vflag,
// apply force to each of 2 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -181,9 +181,16 @@ void DihedralCharmmIntel::eval(const int vflag,
}
#if defined(LMP_SIMD_COMPILER_TEST)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \
spv5)
#else
#pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \
spv5)
#endif
#pragma vector aligned
for (int n = nfrom; n < nto; n++) {
#endif
for (int n = nfrom; n < nto; n += npl) {
@@ -329,7 +336,11 @@ void DihedralCharmmIntel::eval(const int vflag,
#if defined(LMP_SIMD_COMPILER_TEST)
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i2 < nlocal) {
@@ -408,7 +419,11 @@ void DihedralCharmmIntel::eval(const int vflag,
// apply force to each of 4 atoms
#if defined(LMP_SIMD_COMPILER_TEST)
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -154,7 +154,11 @@ void DihedralFourierIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -304,7 +308,11 @@ void DihedralFourierIntel::eval(const int vflag,
}
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -154,7 +154,11 @@ void DihedralHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -299,7 +303,11 @@ void DihedralHarmonicIntel::eval(const int vflag,
}
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -158,7 +158,11 @@ void DihedralOPLSIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -319,7 +323,11 @@ void DihedralOPLSIntel::eval(const int vflag,
}
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -635,19 +635,31 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
if (_nthreads == 4) {
acc_t *f_scalar3 = f_scalar2 + f_stride4;
acc_t *f_scalar4 = f_scalar3 + f_stride4;
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
} else if (_nthreads == 2) {
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n];
} else {
acc_t *f_scalar3 = f_scalar2 + f_stride4;
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n] + f_scalar3[n];
}
@@ -662,8 +674,12 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
acc_t *f_scalar2 = f_scalar + f_stride4;
for (int t = 1; t < _nthreads; t++) {
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = iifrom; n < iito; n++)
f_scalar[n] += f_scalar2[n];
f_scalar2 += f_stride4;

View File

@@ -99,8 +99,12 @@ void FixNHIntel::remap()
if (allremap) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
const double d0 = x[i].x - b0;
@@ -112,8 +116,12 @@ void FixNHIntel::remap()
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & dilate_group_bit) {
@@ -278,8 +286,12 @@ void FixNHIntel::remap()
if (allremap) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
@@ -288,8 +300,12 @@ void FixNHIntel::remap()
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & dilate_group_bit) {
@@ -415,8 +431,12 @@ void FixNHIntel::nh_v_press()
if (igroup == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
v[i].x *= f0;
@@ -425,8 +445,12 @@ void FixNHIntel::nh_v_press()
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) {
@@ -448,8 +472,12 @@ void FixNHIntel::nve_v()
double * _noalias const v = atom->v[0];
const double * _noalias const f = atom->f[0];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
v[i] += _dtfm[i] * f[i];
@@ -468,15 +496,23 @@ void FixNHIntel::nve_x()
if (igroup == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
x[i] += dtv * v[i];
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
if (_dtfm[i] != 0.0)
@@ -500,15 +536,23 @@ void FixNHIntel::nh_v_temp()
if (igroup == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
v[i] *= factor_eta;
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
if (_dtfm[i] != 0.0)

View File

@@ -97,8 +97,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
dtq = 0.5 * dtv;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += _dtfm[i] * f[i];
@@ -108,8 +112,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
// update angular momentum by 1/2 step
if (igroup == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
double *quat = bonus[ellipsoid[i]].quat;
@@ -118,8 +126,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) {
@@ -143,8 +155,12 @@ void FixNVEAsphereIntel::final_integrate()
const double * _noalias const torque = atom->torque[0];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += _dtfm[i] * f[i];

View File

@@ -68,8 +68,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) {
const double dtfm = dtf / atom->mass[1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += dtfm * f[i];
@@ -78,8 +82,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
} else if (igroup == 0) {
if (neighbor->ago == 0) reset_dt();
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += _dtfm[i] * f[i];
@@ -88,8 +96,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
} else {
if (neighbor->ago == 0) reset_dt();
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
if (_dtfm[i] != 0.0) {
@@ -112,16 +124,24 @@ void FixNVEIntel::final_integrate()
_nlocal3 = 3 * atom->nlocal;
const double dtfm = dtf / atom->mass[1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
v[i] += dtfm * f[i];
} else if (igroup == 0) {
if (neighbor->ago == 0) reset_dt();
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += _dtfm[i] * f[i];
@@ -129,8 +149,12 @@ void FixNVEIntel::final_integrate()
} else {
if (neighbor->ago == 0) reset_dt();
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
v[i] += _dtfm[i] * f[i];

View File

@@ -165,7 +165,11 @@ void ImproperCvffIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -247,7 +251,11 @@ void ImproperCvffIntel::eval(const int vflag,
flt_t p, pd;
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (m == 2) {
@@ -319,7 +327,11 @@ void ImproperCvffIntel::eval(const int vflag,
// apply force to each of 4 atoms
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -167,7 +167,11 @@ void ImproperHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -276,7 +280,11 @@ void ImproperHarmonicIntel::eval(const int vflag,
// apply force to each of 4 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -127,7 +127,8 @@ struct vector_ops<double, KNC> {
}
template<int scale>
static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
return _mm512_mask_i32logather_pd(from, mask, idx, base, scale);
return _mm512_mask_i32gather_pd(from, mask, _mm512_castsi512_si256(idx),
base, scale);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return _mm512_mask_blend_pd(mask, a, b);
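This gather change is the second recurring transformation: the KNC-only _mm512_mask_i32logather_pd / _mm512_mask_i32loscatter_pd intrinsics took a full 512-bit index vector and used only its low eight 32-bit indices, while the standard AVX-512 forms take a 256-bit index vector obtained with _mm512_castsi512_si256. A minimal sketch of the equivalence, assuming a placeholder wrapper name gather8:

#include <immintrin.h>

// Hedged sketch of the intrinsic substitution applied in this and the
// following files; gather8 and its arguments are placeholder names.
// The removed KNC form, for comparison (no longer provided by current
// compilers):  _mm512_mask_i32logather_pd(src, m, idx512, base, 8)
__m512d gather8(__m512d src, __mmask8 m, __m512i idx512, const double *base) {
  // Gather 8 doubles through the low 8 int32 indices of idx512, keeping
  // lanes from src where the mask bit is clear -- the same semantics.
  return _mm512_mask_i32gather_pd(src, m, _mm512_castsi512_si256(idx512),
                                  base, 8);
}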

View File

@@ -511,7 +511,8 @@ public:
const int scale) {
assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==8
return FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
return FVEC_SUFFIX(_mm512_i32gather_)(_mm512_castsi512_si256(idx.val_),
mem, sizeof(FVEC_SCAL_T));
# else
return FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
# endif
@@ -522,8 +523,8 @@ public:
) {
assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==8
return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
mem, sizeof(FVEC_SCAL_T));
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_,
_mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T));
# else
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
mem, sizeof(FVEC_SCAL_T));
@@ -609,8 +610,8 @@ public:
) {
assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==8
return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
mem, sizeof(FVEC_SCAL_T));
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_,
_mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T));
# else
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
mem, sizeof(FVEC_SCAL_T));
@@ -622,8 +623,9 @@ public:
) {
assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==8
FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, a.val_,
sizeof(FVEC_SCAL_T));
FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_,
_mm512_castsi512_si256(idx.val_),
a.val_, sizeof(FVEC_SCAL_T));
# else
FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, a.val_,
sizeof(FVEC_SCAL_T));
@@ -666,11 +668,11 @@ public:
const double * mem, const int scale
) {
assert(scale == sizeof(double));
__m512d lo = _mm512_mask_i32logather_pd(src.lo_, mask.val_, idx.val_, mem,
sizeof(double));
__m512d hi = _mm512_mask_i32logather_pd(src.hi_, get_bvec_hi(mask.val_),
get_ivec_hi(idx.val_), mem,
sizeof(double));
__m512d lo = _mm512_mask_i32gather_pd(src.lo_, mask.val_,
_mm512_castsi512_si256(idx.val_),
mem, sizeof(double));
__m512d hi = _mm512_mask_i32gather_pd(src.hi_, get_bvec_hi(mask.val_),
_mm512_castsi512_si256(get_ivec_hi(idx.val_)), mem, sizeof(double));
return avec16pd(lo, hi);
}
VEC_INLINE static void mask_i32loscatter(
@@ -678,10 +680,12 @@ public:
const avec16pd &a, const int scale
) {
assert(scale == sizeof(double));
_mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_,
_mm512_mask_i32scatter_pd(mem, mask.val_,
_mm512_castsi512_si256(idx.val_), a.lo_,
sizeof(double));
_mm512_mask_i32loscatter_pd(mem, get_bvec_hi(mask.val_),
get_ivec_hi(idx.val_), a.hi_, sizeof(double));
_mm512_mask_i32scatter_pd(mem, get_bvec_hi(mask.val_),
_mm512_castsi512_si256(get_ivec_hi(idx.val_)),
a.hi_, sizeof(double));
}
#define AVEC2_BINOP(the_sym, the_name) \

View File

@@ -17,8 +17,13 @@
------------------------------------------------------------------------- */
#ifdef __INTEL_LLVM_COMPILER
#define USE_OMP_SIMD
#define __INTEL_COMPILER __INTEL_LLVM_COMPILER
#define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
#define _MM_SCALE_1 1
#define _MM_SCALE_2 2
#define _MM_SCALE_4 4
#define _MM_SCALE_8 8
#endif
#ifdef __INTEL_COMPILER
@@ -332,6 +337,9 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
#endif
// TO BE DEPRECATED
#ifndef USE_OMP_SIMD
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
f_stride, pos, ov0, ov1, ov2, \
ov3, ov4, ov5) \
@@ -526,6 +534,198 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
} \
}
#else
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
f_stride, pos, ov0, ov1, ov2, \
ov3, ov4, ov5) \
{ \
acc_t *f_scalar = &f_start[0].x; \
flt_t *x_scalar = &pos[minlocal].x; \
int f_stride4 = f_stride * 4; \
_alignvar(acc_t ovv[16],64); \
int vwidth; \
if (sizeof(acc_t) == sizeof(double)) \
vwidth = INTEL_COMPILE_WIDTH/2; \
else \
vwidth = INTEL_COMPILE_WIDTH; \
if (vwidth < 4) vwidth = 4; \
_use_simd_pragma("omp simd aligned(ovv:64)") \
for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \
int remainder = lt % vwidth; \
if (lf > lt) remainder = 0; \
const int v_range = lt - remainder; \
if (nthreads == 2) { \
acc_t *f_scalar2 = f_scalar + f_stride4; \
for (int n = lf; n < v_range; n += vwidth) { \
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,ovv,x_scalar:64)")\
for (int v = 0; v < vwidth; v++) { \
f_scalar[n+v] += f_scalar2[n+v]; \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
} \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
if (vwidth > 4) { \
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
} \
if (vwidth > 8) { \
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
} \
} \
_use_simd_pragma("vector aligned") \
_use_simd_pragma("ivdep") \
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
for (int n = v_range; n < lt; n++) \
f_scalar[n] += f_scalar2[n]; \
} else if (nthreads==4) { \
acc_t *f_scalar2 = f_scalar + f_stride4; \
acc_t *f_scalar3 = f_scalar2 + f_stride4; \
acc_t *f_scalar4 = f_scalar3 + f_stride4; \
for (int n = lf; n < v_range; n += vwidth) { \
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4,ovv:64)") \
for (int v = 0; v < vwidth; v++) { \
f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \
f_scalar4[n+v]; \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
} \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
if (vwidth > 4) { \
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
} \
if (vwidth > 8) { \
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
} \
} \
_use_simd_pragma("vector aligned") \
_use_simd_pragma("ivdep") \
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
for (int n = v_range; n < lt; n++) \
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \
} else if (nthreads==1) { \
for (int n = lf; n < v_range; n += vwidth) { \
_use_simd_pragma("omp simd aligned(ovv,f_scalar,x_scalar:64)") \
for (int v = 0; v < vwidth; v++) \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
if (vwidth > 4) { \
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
} \
if (vwidth > 8) { \
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
} \
} \
} else if (nthreads==3) { \
acc_t *f_scalar2 = f_scalar + f_stride4; \
acc_t *f_scalar3 = f_scalar2 + f_stride4; \
for (int n = lf; n < v_range; n += vwidth) { \
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,ovv,x_scalar:64)") \
for (int v = 0; v < vwidth; v++) { \
f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
} \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
if (vwidth > 4) { \
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
} \
if (vwidth > 8) { \
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
} \
} \
_use_simd_pragma("vector aligned") \
_use_simd_pragma("ivdep") \
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
for (int n = v_range; n < lt; n++) \
f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \
} \
for (int n = v_range; n < lt; n += 4) { \
_use_simd_pragma("vector aligned") \
_use_simd_pragma("ivdep") \
for (int v = 0; v < 4; v++) \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
} \
ov0 += ovv[0]; \
ov1 += ovv[1]; \
ov2 += ovv[2]; \
if (vwidth > 4) { \
ov0 += ovv[4]; \
ov1 += ovv[5]; \
ov2 += ovv[6]; \
} \
if (vwidth > 8) { \
ov0 += ovv[8] + ovv[12]; \
ov1 += ovv[9] + ovv[13]; \
ov2 += ovv[10] + ovv[14]; \
} \
}
#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \
f_stride, pos, offload, vflag, ov0, ov1, \
ov2, ov3, ov4, ov5) \
{ \
int o_range = (nall - minlocal) * 4; \
IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \
sizeof(acc_t)); \
\
acc_t *f_scalar = &f_start[0].x; \
int f_stride4 = f_stride * 4; \
int t; \
if (vflag == VIRIAL_FDOTR) t = 4; else t = 1; \
acc_t *f_scalar2 = f_scalar + f_stride4 * t; \
for ( ; t < nthreads; t++) { \
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2:64)") \
for (int n = iifrom; n < iito; n++) \
f_scalar[n] += f_scalar2[n]; \
f_scalar2 += f_stride4; \
} \
\
if (vflag == VIRIAL_FDOTR) { \
int nt_min = MIN(4,nthreads); \
IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \
f_stride, pos, ov0, ov1, ov2, ov3, ov4, \
ov5); \
} \
}
#endif
#ifdef _LMP_INTEL_OFFLOAD
#include <sys/time.h>

View File

@@ -173,7 +173,7 @@ namespace ip_simd {
}
inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) {
return _mm512_i32logather_pd(i, p, _MM_SCALE_8);
return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8);
}
inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p,
@@ -190,8 +190,8 @@ namespace ip_simd {
inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p,
const SIMD_int &i) {
return _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, p,
_MM_SCALE_8);
return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), p, _MM_SCALE_8);
}
template <typename T>
@@ -227,8 +227,8 @@ namespace ip_simd {
inline SIMD_double SIMD_gatherz(const SIMD_mask &m, const double *p,
const SIMD_int &i) {
return _mm512_mask_i32logather_pd( _mm512_set1_pd(0.0), m, i, p,
_MM_SCALE_8);
return _mm512_mask_i32gather_pd( _mm512_set1_pd(0.0), m,
_mm512_castsi512_si256(i),p, _MM_SCALE_8);
}
// ------- Store Operations
@@ -257,7 +257,8 @@ namespace ip_simd {
inline void SIMD_scatter(const SIMD_mask &m, double *p,
const SIMD_int &i, const SIMD_double &vec) {
_mm512_mask_i32loscatter_pd(p, m, i, vec, _MM_SCALE_8);
_mm512_mask_i32scatter_pd(p, m, _mm512_castsi512_si256(i), vec,
_MM_SCALE_8);
}
// ------- Arithmetic Operations
@@ -834,22 +835,28 @@ namespace ip_simd {
inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
const SIMD_int &i, SIMD_double &x,
SIMD_double &y, SIMD_double &z) {
x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom,
x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom,
_MM_SCALE_2);
y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1,
y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom+1,
_MM_SCALE_2);
z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2,
z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom+2,
_MM_SCALE_2);
}
inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
const SIMD_int &i, SIMD_double &x,
SIMD_double &y, SIMD_double &z, SIMD_int &type) {
x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom,
x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom,
_MM_SCALE_2);
y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1,
y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom+1,
_MM_SCALE_2);
z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2,
z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom+2,
_MM_SCALE_2);
type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3,
_MM_SCALE_2);
@@ -888,10 +895,12 @@ namespace ip_simd {
const SIMD_int &joffset, SIMD_double &eng) {
SIMD_double jeng;
SIMD_conflict_pi_reduce1(rmask, joffset, eng);
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force, _MM_SCALE_2);
jeng = jeng + eng;
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
jeng, _MM_SCALE_2);
}
inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
@@ -899,20 +908,24 @@ namespace ip_simd {
SIMD_double engd, jeng;
engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng));
SIMD_conflict_pi_reduce1(rmask, joffset, engd);
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force, _MM_SCALE_2);
jeng = jeng + engd;
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
jeng, _MM_SCALE_2);
SIMD_mask rmask2 = rmask >> 8;
engd = _mm512_cvtps_pd(_mm512_castps512_ps256(
_mm512_shuffle_f32x4(eng,eng,238)));
SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
SIMD_conflict_pi_reduce1(rmask2, joffset2, engd);
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
_mm512_castsi512_si256(joffset2),
force, _MM_SCALE_2);
jeng = jeng + engd;
_mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
jeng, _MM_SCALE_2);
}
inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force,
@@ -926,10 +939,12 @@ namespace ip_simd {
SIMD_double jeng;
SIMD_conflict_pi_reduce1(rmask, joffset, eng);
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force, _MM_SCALE_2);
jeng = jeng + eng;
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
jeng, _MM_SCALE_2);
}
inline void SIMD_safe_jforce(const SIMD_mask &m, float *force,
@@ -956,18 +971,24 @@ namespace ip_simd {
SIMD_double &fy, SIMD_double &fz) {
SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
SIMD_double jfrc;
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force,
_MM_SCALE_2);
jfrc = jfrc + fx;
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 1,
_MM_SCALE_2);
jfrc = jfrc + fy;
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 2,
_MM_SCALE_2);
jfrc = jfrc + fz;
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
}
inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force,
@@ -979,18 +1000,24 @@ namespace ip_simd {
amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
SIMD_double jfrc;
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force, _MM_SCALE_2);
jfrc = jfrc + amxd;
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force + 1, _MM_SCALE_2);
jfrc = jfrc + amyd;
_mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
_mm512_mask_i32scatter_pd(force+1, rmask, _mm512_castsi512_si256(joffset),
jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force + 2, _MM_SCALE_2);
jfrc = jfrc + amzd;
_mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, rmask, _mm512_castsi512_si256(joffset),
jfrc, _MM_SCALE_2);
SIMD_mask rmask2 = rmask >> 8;
amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
@@ -1001,18 +1028,26 @@ namespace ip_simd {
_mm512_shuffle_f32x4(amz,amz,238)));
SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
_mm512_castsi512_si256(joffset2),
force, _MM_SCALE_2);
jfrc = jfrc + amxd;
_mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
_mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
_mm512_castsi512_si256(joffset2),
force + 1, _MM_SCALE_2);
jfrc = jfrc + amyd;
_mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
_mm512_mask_i32scatter_pd(force+1, rmask2,
_mm512_castsi512_si256(joffset2), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
_mm512_castsi512_si256(joffset2),
force + 2, _MM_SCALE_2);
jfrc = jfrc + amzd;
_mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, rmask2,
_mm512_castsi512_si256(joffset2), jfrc,
_MM_SCALE_2);
}
inline void SIMD_jforce_update(const SIMD_mask &m, float *force,
@@ -1064,18 +1099,24 @@ namespace ip_simd {
const SIMD_int &i, const SIMD_double &fx,
const SIMD_double &fy, const SIMD_double &fz) {
SIMD_double jfrc;
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force,
_MM_SCALE_2);
jfrc = jfrc - fx;
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 1,
_MM_SCALE_2);
jfrc = jfrc - fy;
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 2,
_MM_SCALE_2);
jfrc = jfrc - fz;
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
}
inline void SIMD_jforce_update(const SIMD_mask &rmask,
@@ -1502,11 +1543,12 @@ namespace ip_simd {
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
hmask, k, force + 3,
_MM_SCALE_2);
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
_mm512_castsi512_si256(k),
force + 3, _MM_SCALE_2);
keng = keng + hevdwl;
_mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
keng, _MM_SCALE_2);
}
}
@@ -1523,11 +1565,12 @@ namespace ip_simd {
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
hmask, k, force + 3,
_MM_SCALE_2);
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
_mm512_castsi512_si256(k),
force + 3, _MM_SCALE_2);
keng = keng + hevdwl;
_mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
keng, _MM_SCALE_2);
}
SIMD_mask hmask2 = hmask >> 8;
facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(
@@ -1539,11 +1582,13 @@ namespace ip_simd {
fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238);
SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl);
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
hmask2, k2, force + 3,
_MM_SCALE_2);
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(),
hmask2,
_mm512_castsi512_si256(k2),
force + 3, _MM_SCALE_2);
keng = keng + hevdwl;
_mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force + 3, hmask2, _mm512_castsi512_si256(k2),
keng, _MM_SCALE_2);
}
}
@@ -1815,24 +1860,32 @@ namespace ip_simd {
const int EFLAG, const int eatom,
const SIMD_double &fwtmp) {
SIMD_double jfrc;
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force,
_MM_SCALE_2);
jfrc = jfrc + fx;
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 1,
_MM_SCALE_2);
jfrc = jfrc + fy;
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 2,
_MM_SCALE_2);
jfrc = jfrc + fz;
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
if (EFLAG) {
if (eatom) {
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i,
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i),
force + 3, _MM_SCALE_2);
jfrc = jfrc + fwtmp;
_mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+3, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
}
}
}

View File

@@ -324,7 +324,11 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++)
tj[ncount++] = binpacked[jj];
@@ -345,15 +349,23 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + stencil[k]];
const int bend = binhead[ibin + stencil[k] + 1];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++)
tj[ncount++] = binpacked[jj];
}
} // if i < nlocal
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int u = 0; u < ncount; u++) {
const int j = tj[u];
@@ -425,12 +437,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
int alln = n;
n = 0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#ifdef LMP_INTEL_NBOR_COMPAT
#pragma ivdep
#else
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
#pragma vector aligned
#endif
for (int u = 0; u < alln; u++) {
int which;
@@ -454,12 +470,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
alln = n2;
n2 = maxnbors * 2;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#ifdef LMP_INTEL_NBOR_COMPAT
#pragma ivdep
#else
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
#pragma vector aligned
#endif
for (int u = n2; u < alln; u++) {
int which;

View File

@@ -344,14 +344,22 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++)
tj[ncount++] = binpacked[jj];
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int u = 0; u < ncount; u++) {
const int j = tj[u];
@@ -375,7 +383,11 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin];
const int bend = binhead[ibin + 1];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++) {
const int j = binpacked[jj];
@@ -533,12 +545,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
n = pack_offset;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#ifdef LMP_INTEL_NBOR_COMPAT
#pragma ivdep
#else
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
#pragma vector aligned
#endif
for (int u = n; u < alln; u++) {
int which;
@@ -566,12 +582,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
n2 = pack_offset + maxnbors;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#ifdef LMP_INTEL_NBOR_COMPAT
#pragma ivdep
#else
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
#pragma vector aligned
#endif
for (int u = n2; u < alln; u++) {
int which;
@@ -737,8 +757,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
int jnum = numneigh[i];
if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
#if __INTEL_COMPILER+0 > 1499
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(max:vlmax,vgmax) \
reduction(min:vlmin, vgmin)
#else
#pragma simd reduction(max:vlmax,vgmax) \
reduction(min:vlmin, vgmin)
#endif
#pragma vector aligned
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
#endif
for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj] & NEIGHMASK;
@@ -782,8 +808,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
int jnum = numneigh[i];
if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
int jj = 0;
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (jj = 0; jj < jnum; jj++) {
const int which = jlist[jj] >> SBBITS & 3;
const int j = jlist[jj] & NEIGHMASK;

View File

@@ -248,12 +248,18 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
if (NEWTON_PAIR == 0)
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
if (vflag == VIRIAL_PAIR)
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcecoul, forcebuck, evdwl, ecoul;

View File

@@ -309,9 +309,14 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcebuck, evdwl, ecoul;

View File

@@ -230,12 +230,18 @@ void PairBuckIntel::eval(const int offload, const int vflag,
fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
if (NEWTON_PAIR == 0)
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
if (vflag == VIRIAL_PAIR)
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < jnum; jj++) {

View File

@@ -289,9 +289,14 @@ void PairDPDIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;

View File

@@ -327,8 +327,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:rhoi)
#else
#pragma simd reduction(+:rhoi)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < ej; jj++) {
int jtype;
@@ -369,23 +373,35 @@ void PairEAMIntel::eval(const int offload, const int vflag,
const int rcount = nall;
if (nthreads == 2) {
double *trho2 = rho + nmax;
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (int n = 0; n < rcount; n++)
rho[n] += trho2[n];
} else if (nthreads == 4) {
double *trho2 = rho + nmax;
double *trho3 = trho2 + nmax;
double *trho4 = trho3 + nmax;
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (int n = 0; n < rcount; n++)
rho[n] += trho2[n] + trho3[n] + trho4[n];
} else {
double *trhon = rho + nmax;
for (int t = 1; t < nthreads; t++) {
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (int n = 0; n < rcount; n++)
rho[n] += trhon[n];
trhon += nmax;
@@ -414,8 +430,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
if (EFLAG) tevdwl = (acc_t)0.0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:tevdwl)
#else
#pragma simd reduction(+:tevdwl)
#endif
#pragma vector aligned
#endif
for (int ii = iifrom; ii < iito; ++ii) {
const int i = ilist[ii];
@@ -510,9 +530,14 @@ void PairEAMIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < ej; jj++) {
int jtype;

View File

@@ -449,9 +449,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
#endif
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \
t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#else
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \
t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#endif
for (int jj = 0; jj < packed_j; jj++) {
flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
@@ -806,8 +811,12 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
acc_t *f_scalar2 = f_scalar + fst4;
for (int t = 1; t < nthreads; t++) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int n = iifrom * 8; n < sto; n++)
f_scalar[n] += f_scalar2[n];

View File

@@ -294,9 +294,14 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl;

View File

@@ -314,9 +314,14 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl, ecoul;

View File

@@ -305,9 +305,14 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl, ecoul;

View File

@@ -241,9 +241,15 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5) \
aligned(jlist,x,ljc12oi,special_lj,f,lj34i:64)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#pragma vector aligned
#endif
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;
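This hunk additionally moves the alignment hint into the OpenMP branch through the aligned(...) clause; most other hunks keep the Intel-only "#pragma vector aligned" in both branches, which is the Intel-specific residue the commit message mentions. A sketch of the two spellings, assuming placeholder arrays that the caller really does align to 64 bytes:

// Hedged sketch: Intel-only vs. OpenMP-standard alignment assertions.
// a and b are placeholders and must actually be 64-byte aligned, or the
// generated aligned loads/stores can fault.
void scale_sketch(double *a, const double *b, int n) {
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(a,b:64)   // portable OpenMP alignment clause
#else
#pragma vector aligned             // Intel-only: assume all accesses aligned
#pragma simd
#endif
  for (int i = 0; i < n; i++)
    a[i] *= b[i];
}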

View File

@@ -371,8 +371,12 @@ void PairSWIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < ejnum_pad; jj++) {
acc_t fjxtmp, fjytmp, fjztmp, fjtmp;

View File

@@ -770,8 +770,12 @@ void PPPMDispIntel::particle_map(double delx, double dely, double delz,
IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:flag)
#else
#pragma simd reduction(+:flag)
#endif
#pragma vector aligned
#endif
for (int i = iifrom; i < iito; i++) {
@@ -876,7 +880,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@@ -885,7 +893,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3;
@@ -917,8 +929,12 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
@@ -939,7 +955,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@@ -1025,7 +1045,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@@ -1034,7 +1058,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3;
@@ -1067,8 +1095,12 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
@@ -1089,7 +1121,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr);
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@@ -1173,7 +1209,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@@ -1182,7 +1222,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3;
@@ -1215,8 +1259,12 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m + nysum;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@@ -1307,7 +1355,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@@ -1316,7 +1368,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3;
@ -1349,8 +1405,12 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
@ -1373,7 +1433,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr);
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@ -1454,7 +1518,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho_lookup[idx][k];
@ -1463,7 +1531,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1 = rho_coeff[order-1][k];
@ -1498,8 +1570,12 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -1624,7 +1700,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -1636,7 +1716,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -1680,8 +1764,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -1702,7 +1790,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx[i] *= hx_inv;
@ -1802,7 +1894,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho6_lookup[idx][k];
@ -1811,7 +1907,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
@ -1846,8 +1946,12 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -1967,7 +2071,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -1979,7 +2087,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -2023,8 +2135,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -2045,7 +2161,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx[i] *= hx_inv;
@ -2143,7 +2263,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho6_lookup[idx][k];
@ -2152,7 +2276,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
@ -2206,8 +2334,12 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -2398,7 +2530,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -2410,7 +2546,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -2479,8 +2619,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -2541,7 +2685,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx0[i] *= hx_inv;
@ -2671,7 +2819,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho6_lookup[idx][k];
@ -2680,7 +2832,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
@ -2721,8 +2877,12 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -2848,7 +3008,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -2860,7 +3024,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -2909,8 +3077,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -2992,7 +3164,11 @@ void PPPMDispIntel::precompute_rho()
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
@ -3006,7 +3182,11 @@ void PPPMDispIntel::precompute_rho()
}
if (differentiation_flag == 1) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
@ -3026,7 +3206,11 @@ void PPPMDispIntel::precompute_rho()
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower_6; k<=nupper_6;k++) {
FFT_SCALAR r1 = ZEROF;
@ -3040,7 +3224,11 @@ void PPPMDispIntel::precompute_rho()
}
if (differentiation_flag == 1) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower_6; k<=nupper_6;k++) {
FFT_SCALAR r1 = ZEROF;
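
Where the short grid-spreading loops above carry a trip-count hint, the hunks place it after whichever SIMD directive was selected, while keeping it inside the LMP_SIMD_COMPILER guard, since #pragma loop_count remains Intel-only. A hedged sketch of that shape; the function, buffer names, and the INTEL_P3M_ALIGNED_MAXORDER value are placeholders, not the real package definitions.

#define INTEL_P3M_ALIGNED_MAXORDER 16   // placeholder; the real value comes from the package headers

// Illustrative stand-in for the stencil-spreading loops above.
void spread_row(double *dest, const double *rho_row, double x0,
                int order, int mzy) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
  for (int l = 0; l < order; l++) {
    int mzyx = l + mzy;              // flattened grid index, as in the hunks above
    dest[mzyx] += x0 * rho_row[l];   // accumulate the weighted stencil term
  }
}

The hunks below apply the same conversion to the PPPMIntel versions of these routines.
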


@ -394,8 +394,12 @@ void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers)
IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:flag)
#else
#pragma simd reduction(+:flag)
#endif
#pragma vector aligned
#endif
for (int i = iifrom; i < iito; i++) {
@ -500,7 +504,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -509,7 +517,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3;
@ -541,7 +553,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mzyx = l + mzy;
@ -563,7 +579,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@ -645,7 +665,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho_lookup[idx][k];
@ -654,7 +678,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1 = rho_coeff[order-1][k];
@ -690,7 +718,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mx = l+nxsum;
@ -813,7 +845,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -825,7 +861,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -871,7 +911,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mx = l + nxsum;
@ -893,7 +937,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx[i] *= hx_inv;
@ -942,7 +990,11 @@ void PPPMIntel::precompute_rho()
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
@ -956,7 +1008,11 @@ void PPPMIntel::precompute_rho()
}
if (differentiation_flag == 1) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
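
In the PPPMIntel::particle_map hunk it is the reduction clause that differs between the two standards, while #pragma vector aligned stays unconditional inside the compiler guard. A sketch of that shape with an illustrative bounds test; the function and its arguments are placeholders, and 64-byte alignment of x is assumed here, as the Intel package buffers provide.

// Illustrative sketch of a particle_map-style out-of-range count.
int count_out_of_range(const double *x, int n, double lo, double hi) {
  int flag = 0;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:flag)
#else
#pragma simd reduction(+:flag)
#endif
#pragma vector aligned               // Intel-specific; asserts aligned access to x
#endif
  for (int i = 0; i < n; i++)
    if (x[i] < lo || x[i] >= hi) flag++;   // summed across SIMD lanes
  return flag;
}
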