From fefcd0e2af255056a0dc49a1604e02b9fa92a8c8 Mon Sep 17 00:00:00 2001 From: Mike Brown Date: Mon, 26 Jul 2021 11:22:21 -0700 Subject: [PATCH] Converting Cilk vectorization directives to the OpenMP standard and changing some more deprecated vector intrinsics. Data alignment directives for compiler vectorization are still mostly Intel-specific. --- src/INTEL/angle_charmm_intel.cpp | 8 + src/INTEL/angle_harmonic_intel.cpp | 8 + src/INTEL/bond_fene_intel.cpp | 8 + src/INTEL/bond_harmonic_intel.cpp | 8 + src/INTEL/dihedral_charmm_intel.cpp | 19 +- src/INTEL/dihedral_fourier_intel.cpp | 8 + src/INTEL/dihedral_harmonic_intel.cpp | 8 + src/INTEL/dihedral_opls_intel.cpp | 8 + src/INTEL/fix_intel.cpp | 32 ++- src/INTEL/fix_nh_intel.cpp | 66 ++++- src/INTEL/fix_nve_asphere_intel.cpp | 24 +- src/INTEL/fix_nve_intel.cpp | 36 ++- src/INTEL/improper_cvff_intel.cpp | 12 + src/INTEL/improper_harmonic_intel.cpp | 8 + src/INTEL/intel_intrinsics.h | 3 +- src/INTEL/intel_intrinsics_airebo.h | 36 +-- src/INTEL/intel_preprocess.h | 200 +++++++++++++++ src/INTEL/intel_simd.h | 239 +++++++++++------- src/INTEL/npair_full_bin_ghost_intel.cpp | 26 +- src/INTEL/npair_intel.cpp | 40 ++- src/INTEL/pair_buck_coul_cut_intel.cpp | 10 +- src/INTEL/pair_buck_coul_long_intel.cpp | 9 +- src/INTEL/pair_buck_intel.cpp | 10 +- src/INTEL/pair_dpd_intel.cpp | 9 +- src/INTEL/pair_eam_intel.cpp | 39 ++- src/INTEL/pair_gayberne_intel.cpp | 15 +- .../pair_lj_charmm_coul_charmm_intel.cpp | 9 +- src/INTEL/pair_lj_charmm_coul_long_intel.cpp | 9 +- src/INTEL/pair_lj_cut_coul_long_intel.cpp | 9 +- src/INTEL/pair_lj_cut_intel.cpp | 8 +- src/INTEL/pair_sw_intel.cpp | 6 +- src/INTEL/pppm_disp_intel.cpp | 214 +++++++++++++++- src/INTEL/pppm_intel.cpp | 58 ++++- 33 files changed, 1013 insertions(+), 189 deletions(-) diff --git a/src/INTEL/angle_charmm_intel.cpp b/src/INTEL/angle_charmm_intel.cpp index 29b7ec208b..26943934be 100644 --- a/src/INTEL/angle_charmm_intel.cpp +++ b/src/INTEL/angle_charmm_intel.cpp @@ -162,7 +162,11 @@ void AngleCharmmIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -246,7 +250,11 @@ void AngleCharmmIntel::eval(const int vflag, // apply force to each of 3 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/angle_harmonic_intel.cpp b/src/INTEL/angle_harmonic_intel.cpp index a2d8cc7d13..e392730edc 100644 --- a/src/INTEL/angle_harmonic_intel.cpp +++ b/src/INTEL/angle_harmonic_intel.cpp @@ -162,7 +162,11 @@ void AngleHarmonicIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -228,7 +232,11 @@ void AngleHarmonicIntel::eval(const int vflag, // apply force to each of 3 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/bond_fene_intel.cpp
b/src/INTEL/bond_fene_intel.cpp index 44a8c0d3cf..1ab8da68d9 100644 --- a/src/INTEL/bond_fene_intel.cpp +++ b/src/INTEL/bond_fene_intel.cpp @@ -158,7 +158,11 @@ void BondFENEIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -215,7 +219,11 @@ void BondFENEIntel::eval(const int vflag, // apply force to each of 2 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/bond_harmonic_intel.cpp b/src/INTEL/bond_harmonic_intel.cpp index a37ae091a0..35b194f0fa 100644 --- a/src/INTEL/bond_harmonic_intel.cpp +++ b/src/INTEL/bond_harmonic_intel.cpp @@ -155,7 +155,11 @@ void BondHarmonicIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -184,7 +188,11 @@ void BondHarmonicIntel::eval(const int vflag, // apply force to each of 2 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/dihedral_charmm_intel.cpp b/src/INTEL/dihedral_charmm_intel.cpp index a317be00fb..4116d9134f 100644 --- a/src/INTEL/dihedral_charmm_intel.cpp +++ b/src/INTEL/dihedral_charmm_intel.cpp @@ -181,9 +181,16 @@ void DihedralCharmmIntel::eval(const int vflag, } #if defined(LMP_SIMD_COMPILER_TEST) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \ + sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \ + spv5) +#else #pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \ - sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5) + sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \ + spv5) +#endif + #pragma vector aligned for (int n = nfrom; n < nto; n++) { #endif for (int n = nfrom; n < nto; n += npl) { @@ -329,7 +336,11 @@ void DihedralCharmmIntel::eval(const int vflag, #if defined(LMP_SIMD_COMPILER_TEST) +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i2 < nlocal) { @@ -408,7 +419,11 @@ void DihedralCharmmIntel::eval(const int vflag, // apply force to each of 4 atoms #if defined(LMP_SIMD_COMPILER_TEST) +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/dihedral_fourier_intel.cpp b/src/INTEL/dihedral_fourier_intel.cpp index 4d44ea36d2..d952ac7506 100644 --- a/src/INTEL/dihedral_fourier_intel.cpp +++ b/src/INTEL/dihedral_fourier_intel.cpp @@ -154,7 +154,11 @@ void DihedralFourierIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n 
+= npl) { @@ -304,7 +308,11 @@ void DihedralFourierIntel::eval(const int vflag, } #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/dihedral_harmonic_intel.cpp b/src/INTEL/dihedral_harmonic_intel.cpp index f7009689c7..df9304b6ba 100644 --- a/src/INTEL/dihedral_harmonic_intel.cpp +++ b/src/INTEL/dihedral_harmonic_intel.cpp @@ -154,7 +154,11 @@ void DihedralHarmonicIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -299,7 +303,11 @@ void DihedralHarmonicIntel::eval(const int vflag, } #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/dihedral_opls_intel.cpp b/src/INTEL/dihedral_opls_intel.cpp index ab007dad8c..89f06773d5 100644 --- a/src/INTEL/dihedral_opls_intel.cpp +++ b/src/INTEL/dihedral_opls_intel.cpp @@ -158,7 +158,11 @@ void DihedralOPLSIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -319,7 +323,11 @@ void DihedralOPLSIntel::eval(const int vflag, } #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/fix_intel.cpp b/src/INTEL/fix_intel.cpp index 59eea4961a..d0633d7791 100644 --- a/src/INTEL/fix_intel.cpp +++ b/src/INTEL/fix_intel.cpp @@ -635,19 +635,31 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar) if (_nthreads == 4) { acc_t *f_scalar3 = f_scalar2 + f_stride4; acc_t *f_scalar4 = f_scalar3 + f_stride4; - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") + #if defined(USE_OMP_SIMD) + #pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4:64) + #elif defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int n = 0; n < o_range; n++) f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; } else if (_nthreads == 2) { - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") + #if defined(USE_OMP_SIMD) + #pragma omp simd aligned(f_scalar,f_scalar2:64) + #elif defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int n = 0; n < o_range; n++) f_scalar[n] += f_scalar2[n]; } else { acc_t *f_scalar3 = f_scalar2 + f_stride4; - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") + #if defined(USE_OMP_SIMD) + #pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3:64) + #elif defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int n = 0; n < o_range; n++) f_scalar[n] += f_scalar2[n] + f_scalar3[n]; } @@ -662,8 +674,12 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar) acc_t *f_scalar2 = f_scalar + f_stride4; for (int t = 1; t < _nthreads; t++) { - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") + #if defined(USE_OMP_SIMD) + 
#pragma omp simd aligned(f_scalar,f_scalar2:64) + #elif defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int n = iifrom; n < iito; n++) f_scalar[n] += f_scalar2[n]; f_scalar2 += f_stride4; diff --git a/src/INTEL/fix_nh_intel.cpp b/src/INTEL/fix_nh_intel.cpp index 5370e3a13f..a4fdecbd96 100644 --- a/src/INTEL/fix_nh_intel.cpp +++ b/src/INTEL/fix_nh_intel.cpp @@ -99,8 +99,12 @@ void FixNHIntel::remap() if (allremap) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { const double d0 = x[i].x - b0; @@ -112,8 +116,12 @@ void FixNHIntel::remap() } } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & dilate_group_bit) { @@ -278,8 +286,12 @@ void FixNHIntel::remap() if (allremap) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; @@ -288,8 +300,12 @@ void FixNHIntel::remap() } } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & dilate_group_bit) { @@ -415,8 +431,12 @@ void FixNHIntel::nh_v_press() if (igroup == 0) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { v[i].x *= f0; @@ -425,8 +445,12 @@ void FixNHIntel::nh_v_press() } } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { @@ -448,8 +472,12 @@ void FixNHIntel::nve_v() double * _noalias const v = atom->v[0]; const double * _noalias const f = atom->f[0]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) v[i] += _dtfm[i] * f[i]; @@ -468,15 +496,23 @@ void FixNHIntel::nve_x() if (igroup == 0) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) x[i] += dtv * v[i]; } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) @@ -500,15 +536,23 @@ void FixNHIntel::nh_v_temp() if (igroup == 0) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) v[i] *= factor_eta; } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) diff --git 
a/src/INTEL/fix_nve_asphere_intel.cpp b/src/INTEL/fix_nve_asphere_intel.cpp index 78504c237a..eda8b48a67 100644 --- a/src/INTEL/fix_nve_asphere_intel.cpp +++ b/src/INTEL/fix_nve_asphere_intel.cpp @@ -97,8 +97,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/) dtq = 0.5 * dtv; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += _dtfm[i] * f[i]; @@ -108,8 +112,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/) // update angular momentum by 1/2 step if (igroup == 0) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { double *quat = bonus[ellipsoid[i]].quat; @@ -118,8 +126,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/) } } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { @@ -143,8 +155,12 @@ void FixNVEAsphereIntel::final_integrate() const double * _noalias const torque = atom->torque[0]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += _dtfm[i] * f[i]; diff --git a/src/INTEL/fix_nve_intel.cpp b/src/INTEL/fix_nve_intel.cpp index fb90946da0..9670af65c2 100644 --- a/src/INTEL/fix_nve_intel.cpp +++ b/src/INTEL/fix_nve_intel.cpp @@ -68,8 +68,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/) if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) { const double dtfm = dtf / atom->mass[1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += dtfm * f[i]; @@ -78,8 +82,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/) } else if (igroup == 0) { if (neighbor->ago == 0) reset_dt(); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += _dtfm[i] * f[i]; @@ -88,8 +96,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/) } else { if (neighbor->ago == 0) reset_dt(); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) { @@ -112,16 +124,24 @@ void FixNVEIntel::final_integrate() _nlocal3 = 3 * atom->nlocal; const double dtfm = dtf / atom->mass[1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) v[i] += dtfm * f[i]; } else if (igroup == 0) { if (neighbor->ago == 0) reset_dt(); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += _dtfm[i] * f[i]; @@ -129,8 +149,12 @@ void FixNVEIntel::final_integrate() } else { if (neighbor->ago == 0) reset_dt(); #if 
defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) v[i] += _dtfm[i] * f[i]; diff --git a/src/INTEL/improper_cvff_intel.cpp b/src/INTEL/improper_cvff_intel.cpp index 62dcde36b9..4d473de7aa 100644 --- a/src/INTEL/improper_cvff_intel.cpp +++ b/src/INTEL/improper_cvff_intel.cpp @@ -165,7 +165,11 @@ void ImproperCvffIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -247,7 +251,11 @@ void ImproperCvffIntel::eval(const int vflag, flt_t p, pd; #ifdef LMP_INTEL_USE_SIMDOFF_FIX +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (m == 2) { @@ -319,7 +327,11 @@ void ImproperCvffIntel::eval(const int vflag, // apply force to each of 4 atoms #ifdef LMP_INTEL_USE_SIMDOFF_FIX +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/improper_harmonic_intel.cpp b/src/INTEL/improper_harmonic_intel.cpp index b3d4c342d9..1a637fa1a6 100644 --- a/src/INTEL/improper_harmonic_intel.cpp +++ b/src/INTEL/improper_harmonic_intel.cpp @@ -167,7 +167,11 @@ void ImproperHarmonicIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -276,7 +280,11 @@ void ImproperHarmonicIntel::eval(const int vflag, // apply force to each of 4 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/intel_intrinsics.h b/src/INTEL/intel_intrinsics.h index 295310283d..567f04c5dc 100644 --- a/src/INTEL/intel_intrinsics.h +++ b/src/INTEL/intel_intrinsics.h @@ -127,7 +127,8 @@ struct vector_ops { } template static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) { - return _mm512_mask_i32logather_pd(from, mask, idx, base, scale); + return _mm512_mask_i32gather_pd(from, mask, _mm512_castsi512_si256(idx), + base, scale); } static fvec blend(const bvec &mask, const fvec &a, const fvec &b) { return _mm512_mask_blend_pd(mask, a, b); diff --git a/src/INTEL/intel_intrinsics_airebo.h b/src/INTEL/intel_intrinsics_airebo.h index ac58ca2438..ea29888ea1 100644 --- a/src/INTEL/intel_intrinsics_airebo.h +++ b/src/INTEL/intel_intrinsics_airebo.h @@ -511,7 +511,8 @@ public: const int scale) { assert(scale == sizeof(FVEC_SCAL_T)); # if FVEC_LEN==8 - return FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem, sizeof(FVEC_SCAL_T)); + return FVEC_SUFFIX(_mm512_i32gather_)(_mm512_castsi512_si256(idx.val_), + mem, sizeof(FVEC_SCAL_T)); # else return FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem, sizeof(FVEC_SCAL_T)); # endif @@ -522,8 +523,8 @@ public: ) { assert(scale == sizeof(FVEC_SCAL_T)); # if FVEC_LEN==8 - return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_, - mem, sizeof(FVEC_SCAL_T)); + 
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, + _mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T)); # else return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_, mem, sizeof(FVEC_SCAL_T)); @@ -609,8 +610,8 @@ public: ) { assert(scale == sizeof(FVEC_SCAL_T)); # if FVEC_LEN==8 - return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_, - mem, sizeof(FVEC_SCAL_T)); + return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, + _mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T)); # else return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_, mem, sizeof(FVEC_SCAL_T)); @@ -622,8 +623,9 @@ public: ) { assert(scale == sizeof(FVEC_SCAL_T)); # if FVEC_LEN==8 - FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, a.val_, - sizeof(FVEC_SCAL_T)); + FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, + _mm512_castsi512_si256(idx.val_), + a.val_, sizeof(FVEC_SCAL_T)); # else FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, a.val_, sizeof(FVEC_SCAL_T)); @@ -666,11 +668,11 @@ public: const double * mem, const int scale ) { assert(scale == sizeof(double)); - __m512d lo = _mm512_mask_i32logather_pd(src.lo_, mask.val_, idx.val_, mem, - sizeof(double)); - __m512d hi = _mm512_mask_i32logather_pd(src.hi_, get_bvec_hi(mask.val_), - get_ivec_hi(idx.val_), mem, - sizeof(double)); + __m512d lo = _mm512_mask_i32gather_pd(src.lo_, mask.val_, + _mm512_castsi512_si256(idx.val_), + mem, sizeof(double)); + __m512d hi = _mm512_mask_i32gather_pd(src.hi_, get_bvec_hi(mask.val_), + _mm512_castsi512_si256(get_ivec_hi(idx.val_)), mem, sizeof(double)); return avec16pd(lo, hi); } VEC_INLINE static void mask_i32loscatter( @@ -678,10 +680,12 @@ public: const avec16pd &a, const int scale ) { assert(scale == sizeof(double)); - _mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_, - sizeof(double)); - _mm512_mask_i32loscatter_pd(mem, get_bvec_hi(mask.val_), - get_ivec_hi(idx.val_), a.hi_, sizeof(double)); + _mm512_mask_i32scatter_pd(mem, mask.val_, + _mm512_castsi512_si256(idx.val_), a.lo_, + sizeof(double)); + _mm512_mask_i32scatter_pd(mem, get_bvec_hi(mask.val_), + _mm512_castsi512_si256(get_ivec_hi(idx.val_)), + a.hi_, sizeof(double)); } #define AVEC2_BINOP(the_sym, the_name) \ diff --git a/src/INTEL/intel_preprocess.h b/src/INTEL/intel_preprocess.h index 0bec9935db..41c91d1578 100644 --- a/src/INTEL/intel_preprocess.h +++ b/src/INTEL/intel_preprocess.h @@ -17,8 +17,13 @@ ------------------------------------------------------------------------- */ #ifdef __INTEL_LLVM_COMPILER +#define USE_OMP_SIMD #define __INTEL_COMPILER __INTEL_LLVM_COMPILER #define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER +#define _MM_SCALE_1 1 +#define _MM_SCALE_2 2 +#define _MM_SCALE_4 4 +#define _MM_SCALE_8 8 #endif #ifdef __INTEL_COMPILER @@ -332,6 +337,9 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #endif +// TO BE DEPRECATED +#ifndef USE_OMP_SIMD + #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ f_stride, pos, ov0, ov1, ov2, \ ov3, ov4, ov5) \ @@ -526,6 +534,198 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, } \ } +#else + +#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ + f_stride, pos, ov0, ov1, ov2, \ + ov3, ov4, ov5) \ +{ \ + acc_t *f_scalar = &f_start[0].x; \ + flt_t *x_scalar = &pos[minlocal].x; \ + int f_stride4 = f_stride * 4; \ + _alignvar(acc_t ovv[16],64); \ + int vwidth; \ + if (sizeof(acc_t) 
== sizeof(double)) \ + vwidth = INTEL_COMPILE_WIDTH/2; \ + else \ + vwidth = INTEL_COMPILE_WIDTH; \ + if (vwidth < 4) vwidth = 4; \ + _use_simd_pragma("omp simd aligned(ovv:64)") \ + for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \ + int remainder = lt % vwidth; \ + if (lf > lt) remainder = 0; \ + const int v_range = lt - remainder; \ + if (nthreads == 2) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,ovv,x_scalar:64)")\ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n]; \ + } else if (nthreads==4) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + acc_t *f_scalar3 = f_scalar2 + f_stride4; \ + acc_t *f_scalar4 = f_scalar3 + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4,ovv:64)") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \ + f_scalar4[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \ + } else if (nthreads==1) { \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("omp simd aligned(ovv,f_scalar,x_scalar:64)") \ + for (int v = 0; v < vwidth; v++) \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + } else if 
(nthreads==3) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + acc_t *f_scalar3 = f_scalar2 + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,ovv,x_scalar:64)") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \ + } \ + for (int n = v_range; n < lt; n += 4) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + for (int v = 0; v < 4; v++) \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + } \ + ov0 += ovv[0]; \ + ov1 += ovv[1]; \ + ov2 += ovv[2]; \ + if (vwidth > 4) { \ + ov0 += ovv[4]; \ + ov1 += ovv[5]; \ + ov2 += ovv[6]; \ + } \ + if (vwidth > 8) { \ + ov0 += ovv[8] + ovv[12]; \ + ov1 += ovv[9] + ovv[13]; \ + ov2 += ovv[10] + ovv[14]; \ + } \ +} + +#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \ + f_stride, pos, offload, vflag, ov0, ov1, \ + ov2, ov3, ov4, ov5) \ +{ \ + int o_range = (nall - minlocal) * 4; \ + IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \ + sizeof(acc_t)); \ + \ + acc_t *f_scalar = &f_start[0].x; \ + int f_stride4 = f_stride * 4; \ + int t; \ + if (vflag == VIRIAL_FDOTR) t = 4; else t = 1; \ + acc_t *f_scalar2 = f_scalar + f_stride4 * t; \ + for ( ; t < nthreads; t++) { \ + _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2:64)") \ + for (int n = iifrom; n < iito; n++) \ + f_scalar[n] += f_scalar2[n]; \ + f_scalar2 += f_stride4; \ + } \ + \ + if (vflag == VIRIAL_FDOTR) { \ + int nt_min = MIN(4,nthreads); \ + IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \ + f_stride, pos, ov0, ov1, ov2, ov3, ov4, \ + ov5); \ + } \ +} + +#endif + #ifdef _LMP_INTEL_OFFLOAD #include diff --git a/src/INTEL/intel_simd.h b/src/INTEL/intel_simd.h index 165455a33d..eb5d9857a5 100644 --- a/src/INTEL/intel_simd.h +++ b/src/INTEL/intel_simd.h @@ -173,7 +173,7 @@ namespace ip_simd { } inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) { - return _mm512_i32logather_pd(i, p, _MM_SCALE_8); + return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8); } inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p, @@ -190,8 +190,8 @@ namespace ip_simd { inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p, const SIMD_int &i) { - return _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, p, - _MM_SCALE_8); + return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), p, _MM_SCALE_8); } template @@ -227,8 +227,8 @@ namespace ip_simd { inline SIMD_double SIMD_gatherz(const 
SIMD_mask &m, const double *p, const SIMD_int &i) { - return _mm512_mask_i32logather_pd( _mm512_set1_pd(0.0), m, i, p, - _MM_SCALE_8); + return _mm512_mask_i32gather_pd( _mm512_set1_pd(0.0), m, + _mm512_castsi512_si256(i),p, _MM_SCALE_8); } // ------- Store Operations @@ -257,7 +257,8 @@ namespace ip_simd { inline void SIMD_scatter(const SIMD_mask &m, double *p, const SIMD_int &i, const SIMD_double &vec) { - _mm512_mask_i32loscatter_pd(p, m, i, vec, _MM_SCALE_8); + _mm512_mask_i32scatter_pd(p, m, _mm512_castsi512_si256(i), vec, + _MM_SCALE_8); } // ------- Arithmetic Operations @@ -834,23 +835,29 @@ namespace ip_simd { inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom, const SIMD_int &i, SIMD_double &x, SIMD_double &y, SIMD_double &z) { - x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, - _MM_SCALE_2); - y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1, - _MM_SCALE_2); - z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2, - _MM_SCALE_2); + x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom, + _MM_SCALE_2); + y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom+1, + _MM_SCALE_2); + z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom+2, + _MM_SCALE_2); } inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom, const SIMD_int &i, SIMD_double &x, SIMD_double &y, SIMD_double &z, SIMD_int &type) { - x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, - _MM_SCALE_2); - y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1, - _MM_SCALE_2); - z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2, - _MM_SCALE_2); + x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom, + _MM_SCALE_2); + y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom+1, + _MM_SCALE_2); + z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom+2, + _MM_SCALE_2); type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3, _MM_SCALE_2); } @@ -888,10 +895,12 @@ namespace ip_simd { const SIMD_int &joffset, SIMD_double &eng) { SIMD_double jeng; SIMD_conflict_pi_reduce1(rmask, joffset, eng); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force, _MM_SCALE_2); jeng = jeng + eng; - _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset), + jeng, _MM_SCALE_2); } inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, @@ -899,20 +908,24 @@ namespace ip_simd { SIMD_double engd, jeng; engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng)); SIMD_conflict_pi_reduce1(rmask, joffset, engd); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force, _MM_SCALE_2); jeng = jeng + engd; - _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset), + jeng, _MM_SCALE_2); SIMD_mask rmask2 = rmask >> 8; engd = _mm512_cvtps_pd(_mm512_castps512_ps256( _mm512_shuffle_f32x4(eng,eng,238))); SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, 
joffset, 238); SIMD_conflict_pi_reduce1(rmask2, joffset2, engd); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2, + _mm512_castsi512_si256(joffset2), + force, _MM_SCALE_2); jeng = jeng + engd; - _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2), + jeng, _MM_SCALE_2); } inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force, @@ -926,10 +939,12 @@ namespace ip_simd { SIMD_double jeng; SIMD_conflict_pi_reduce1(rmask, joffset, eng); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force, _MM_SCALE_2); jeng = jeng + eng; - _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset), + jeng, _MM_SCALE_2); } inline void SIMD_safe_jforce(const SIMD_mask &m, float *force, @@ -956,18 +971,24 @@ namespace ip_simd { SIMD_double &fy, SIMD_double &fz) { SIMD_conflict_pi_reduce3(m, i, fx, fy, fz); SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force, + _MM_SCALE_2); jfrc = jfrc + fx; - _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 1, + _MM_SCALE_2); jfrc = jfrc + fy; - _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 2, + _MM_SCALE_2); jfrc = jfrc + fz; - _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); } inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force, @@ -979,40 +1000,54 @@ namespace ip_simd { amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz)); SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd); SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force, _MM_SCALE_2); jfrc = jfrc + amxd; - _mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force + 1, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset), + jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force + 1, _MM_SCALE_2); jfrc = jfrc + amyd; - _mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force + 2, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, rmask, 
_mm512_castsi512_si256(joffset), + jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force + 2, _MM_SCALE_2); jfrc = jfrc + amzd; - _mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, rmask, _mm512_castsi512_si256(joffset), + jfrc, _MM_SCALE_2); SIMD_mask rmask2 = rmask >> 8; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amx,amx,238))); + _mm512_shuffle_f32x4(amx,amx,238))); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amy,amy,238))); + _mm512_shuffle_f32x4(amy,amy,238))); amzd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amz,amz,238))); + _mm512_shuffle_f32x4(amz,amz,238))); SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2, + _mm512_castsi512_si256(joffset2), + force, _MM_SCALE_2); jfrc = jfrc + amxd; - _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force + 1, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2), + jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2, + _mm512_castsi512_si256(joffset2), + force + 1, _MM_SCALE_2); jfrc = jfrc + amyd; - _mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force + 2, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, rmask2, + _mm512_castsi512_si256(joffset2), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2, + _mm512_castsi512_si256(joffset2), + force + 2, _MM_SCALE_2); jfrc = jfrc + amzd; - _mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, rmask2, + _mm512_castsi512_si256(joffset2), jfrc, + _MM_SCALE_2); } inline void SIMD_jforce_update(const SIMD_mask &m, float *force, @@ -1064,18 +1099,24 @@ namespace ip_simd { const SIMD_int &i, const SIMD_double &fx, const SIMD_double &fy, const SIMD_double &fz) { SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force, + _MM_SCALE_2); jfrc = jfrc - fx; - _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 1, + _MM_SCALE_2); jfrc = jfrc - fy; - _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 2, + _MM_SCALE_2); jfrc = jfrc - fz; - _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); } inline void 
SIMD_jforce_update(const SIMD_mask &rmask, @@ -1502,11 +1543,12 @@ namespace ip_simd { fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); SIMD_conflict_pi_reduce1(hmask, k, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask, k, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask, + _mm512_castsi512_si256(k), + force + 3, _MM_SCALE_2); keng = keng + hevdwl; - _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k), + keng, _MM_SCALE_2); } } @@ -1523,11 +1565,12 @@ namespace ip_simd { fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); SIMD_conflict_pi_reduce1(hmask, k, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask, k, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask, + _mm512_castsi512_si256(k), + force + 3, _MM_SCALE_2); keng = keng + hevdwl; - _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k), + keng, _MM_SCALE_2); } SIMD_mask hmask2 = hmask >> 8; facradd = _mm512_cvtps_pd(_mm512_castps512_ps256( @@ -1539,11 +1582,13 @@ namespace ip_simd { fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl); SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238); SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask2, k2, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), + hmask2, + _mm512_castsi512_si256(k2), + force + 3, _MM_SCALE_2); keng = keng + hevdwl; - _mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force + 3, hmask2, _mm512_castsi512_si256(k2), + keng, _MM_SCALE_2); } } @@ -1815,24 +1860,32 @@ namespace ip_simd { const int EFLAG, const int eatom, const SIMD_double &fwtmp) { SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force, + _MM_SCALE_2); jfrc = jfrc + fx; - _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 1, + _MM_SCALE_2); jfrc = jfrc + fy; - _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 2, + _MM_SCALE_2); jfrc = jfrc + fz; - _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); if (EFLAG) { if (eatom) { - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, - force + 3, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), + force + 3, _MM_SCALE_2); jfrc = jfrc + fwtmp; - _mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, 
_MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+3, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); } } } diff --git a/src/INTEL/npair_full_bin_ghost_intel.cpp b/src/INTEL/npair_full_bin_ghost_intel.cpp index 082f95721f..e96f2c713d 100644 --- a/src/INTEL/npair_full_bin_ghost_intel.cpp +++ b/src/INTEL/npair_full_bin_ghost_intel.cpp @@ -324,7 +324,11 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int jj = bstart; jj < bend; jj++) tj[ncount++] = binpacked[jj]; @@ -345,15 +349,23 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + stencil[k]]; const int bend = binhead[ibin + stencil[k] + 1]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int jj = bstart; jj < bend; jj++) tj[ncount++] = binpacked[jj]; } } // if i < nlocal #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int u = 0; u < ncount; u++) { const int j = tj[u]; @@ -425,12 +437,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, int alln = n; n = 0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #ifdef LMP_INTEL_NBOR_COMPAT #pragma ivdep #else +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif + #pragma vector aligned #endif for (int u = 0; u < alln; u++) { int which; @@ -454,12 +470,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, alln = n2; n2 = maxnbors * 2; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #ifdef LMP_INTEL_NBOR_COMPAT #pragma ivdep #else +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif + #pragma vector aligned #endif for (int u = n2; u < alln; u++) { int which; diff --git a/src/INTEL/npair_intel.cpp b/src/INTEL/npair_intel.cpp index 643ceff8f3..395e50006c 100644 --- a/src/INTEL/npair_intel.cpp +++ b/src/INTEL/npair_intel.cpp @@ -344,14 +344,22 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int jj = bstart; jj < bend; jj++) tj[ncount++] = binpacked[jj]; } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int u = 0; u < ncount; u++) { const int j = tj[u]; @@ -375,7 +383,11 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin]; const int bend = binhead[ibin + 1]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int jj = bstart; jj < bend; jj++) { const int j = binpacked[jj]; @@ -533,12 +545,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, n = pack_offset; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #ifdef LMP_INTEL_NBOR_COMPAT #pragma ivdep #else +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif + #pragma vector aligned #endif for (int u = n; u < alln; u++) { int which; @@ -566,12 +582,16 @@ void 
NPairIntel::bin_newton(const int offload, NeighList *list, n2 = pack_offset + maxnbors; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #ifdef LMP_INTEL_NBOR_COMPAT #pragma ivdep #else +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif + #pragma vector aligned #endif for (int u = n2; u < alln; u++) { int which; @@ -737,8 +757,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, int jnum = numneigh[i]; if (!THREE) IP_PRE_neighbor_pad(jnum, offload); #if __INTEL_COMPILER+0 > 1499 +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(max:vlmax,vgmax) \ + reduction(min:vlmin, vgmin) +#else + #pragma simd reduction(max:vlmax,vgmax) \ + reduction(min:vlmin, vgmin) +#endif #pragma vector aligned - #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) #endif for (int jj = 0; jj < jnum; jj++) { const int j = jlist[jj] & NEIGHMASK; @@ -782,8 +808,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, int jnum = numneigh[i]; if (!THREE) IP_PRE_neighbor_pad(jnum, offload); int jj = 0; - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned for (jj = 0; jj < jnum; jj++) { const int which = jlist[jj] >> SBBITS & 3; const int j = jlist[jj] & NEIGHMASK; diff --git a/src/INTEL/pair_buck_coul_cut_intel.cpp b/src/INTEL/pair_buck_coul_cut_intel.cpp index 99905bfaa0..c67450fbc1 100644 --- a/src/INTEL/pair_buck_coul_cut_intel.cpp +++ b/src/INTEL/pair_buck_coul_cut_intel.cpp @@ -248,12 +248,18 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, fxtmp = fytmp = fztmp = (acc_t)0; if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; if (NEWTON_PAIR == 0) - if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + if (vflag == VIRIAL_PAIR) + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcecoul, forcebuck, evdwl, ecoul; diff --git a/src/INTEL/pair_buck_coul_long_intel.cpp b/src/INTEL/pair_buck_coul_long_intel.cpp index 1566ec23b6..7c795d5914 100644 --- a/src/INTEL/pair_buck_coul_long_intel.cpp +++ b/src/INTEL/pair_buck_coul_long_intel.cpp @@ -309,9 +309,14 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#else + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcebuck, evdwl, ecoul; diff --git a/src/INTEL/pair_buck_intel.cpp b/src/INTEL/pair_buck_intel.cpp index 26ef13be9a..ddab17765b 100644 --- a/src/INTEL/pair_buck_intel.cpp +++ b/src/INTEL/pair_buck_intel.cpp @@ -230,12 +230,18 @@ void PairBuckIntel::eval(const int offload, const int vflag, fxtmp = fytmp = fztmp = (acc_t)0; if (EFLAG) fwtmp = sevdwl = (acc_t)0; if (NEWTON_PAIR == 0) - if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + if (vflag == VIRIAL_PAIR) + sv0 
= sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < jnum; jj++) { diff --git a/src/INTEL/pair_dpd_intel.cpp b/src/INTEL/pair_dpd_intel.cpp index e7514a1f95..a9eb4fe6a4 100644 --- a/src/INTEL/pair_dpd_intel.cpp +++ b/src/INTEL/pair_dpd_intel.cpp @@ -289,9 +289,14 @@ void PairDPDIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + sv0, sv1, sv2, sv3, sv4, sv5) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcelj, evdwl; diff --git a/src/INTEL/pair_eam_intel.cpp b/src/INTEL/pair_eam_intel.cpp index dcff8957fd..13dbd60cb3 100644 --- a/src/INTEL/pair_eam_intel.cpp +++ b/src/INTEL/pair_eam_intel.cpp @@ -327,8 +327,12 @@ void PairEAMIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:rhoi) +#else #pragma simd reduction(+:rhoi) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < ej; jj++) { int jtype; @@ -369,23 +373,35 @@ void PairEAMIntel::eval(const int offload, const int vflag, const int rcount = nall; if (nthreads == 2) { double *trho2 = rho + nmax; - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned for (int n = 0; n < rcount; n++) rho[n] += trho2[n]; } else if (nthreads == 4) { double *trho2 = rho + nmax; double *trho3 = trho2 + nmax; double *trho4 = trho3 + nmax; - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned for (int n = 0; n < rcount; n++) rho[n] += trho2[n] + trho3[n] + trho4[n]; } else { double *trhon = rho + nmax; for (int t = 1; t < nthreads; t++) { - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned for (int n = 0; n < rcount; n++) rho[n] += trhon[n]; trhon += nmax; @@ -414,8 +430,12 @@ void PairEAMIntel::eval(const int offload, const int vflag, if (EFLAG) tevdwl = (acc_t)0.0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:tevdwl) +#else #pragma simd reduction(+:tevdwl) +#endif + #pragma vector aligned #endif for (int ii = iifrom; ii < iito; ++ii) { const int i = ilist[ii]; @@ -510,9 +530,14 @@ void PairEAMIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + sv0, sv1, sv2, sv3, sv4, sv5) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < ej; jj++) { int jtype; diff --git a/src/INTEL/pair_gayberne_intel.cpp b/src/INTEL/pair_gayberne_intel.cpp index d7becc7585..c3abf68c12 100644 --- a/src/INTEL/pair_gayberne_intel.cpp +++ 
b/src/INTEL/pair_gayberne_intel.cpp @@ -449,9 +449,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0); #endif #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \ + t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) +#else + #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \ + t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \ - sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) #endif for (int jj = 0; jj < packed_j; jj++) { flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8; @@ -806,8 +811,12 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, acc_t *f_scalar2 = f_scalar + fst4; for (int t = 1; t < nthreads; t++) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int n = iifrom * 8; n < sto; n++) f_scalar[n] += f_scalar2[n]; diff --git a/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp index ad8ef4d84f..ef26f8f2d5 100644 --- a/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp +++ b/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp @@ -294,9 +294,14 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#else + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcelj, evdwl; diff --git a/src/INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/INTEL/pair_lj_charmm_coul_long_intel.cpp index a910c74acb..6f6bb3618e 100644 --- a/src/INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -314,9 +314,14 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#else + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcelj, evdwl, ecoul; diff --git a/src/INTEL/pair_lj_cut_coul_long_intel.cpp b/src/INTEL/pair_lj_cut_coul_long_intel.cpp index 51e208314b..0d94fdb4c3 100644 --- a/src/INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/INTEL/pair_lj_cut_coul_long_intel.cpp @@ -305,9 +305,14 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#else + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t 
forcecoul, forcelj, evdwl, ecoul; diff --git a/src/INTEL/pair_lj_cut_intel.cpp b/src/INTEL/pair_lj_cut_intel.cpp index 84bc664e18..cf84cb3ca5 100644 --- a/src/INTEL/pair_lj_cut_intel.cpp +++ b/src/INTEL/pair_lj_cut_intel.cpp @@ -241,9 +241,15 @@ void PairLJCutIntel::eval(const int offload, const int vflag, if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) \ + aligned(jlist,x,ljc12oi,special_lj,f,lj34i:64) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) + #pragma vector aligned +#endif #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcelj, evdwl; diff --git a/src/INTEL/pair_sw_intel.cpp b/src/INTEL/pair_sw_intel.cpp index 17dffa2843..57a6b29945 100644 --- a/src/INTEL/pair_sw_intel.cpp +++ b/src/INTEL/pair_sw_intel.cpp @@ -371,8 +371,12 @@ void PairSWIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < ejnum_pad; jj++) { acc_t fjxtmp, fjytmp, fjztmp, fjtmp; diff --git a/src/INTEL/pppm_disp_intel.cpp b/src/INTEL/pppm_disp_intel.cpp index 8d4ed1778d..6b732ccfac 100644 --- a/src/INTEL/pppm_disp_intel.cpp +++ b/src/INTEL/pppm_disp_intel.cpp @@ -770,8 +770,12 @@ void PPPMDispIntel::particle_map(double delx, double dely, double delz, IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T)); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:flag) +#else #pragma simd reduction(+:flag) +#endif + #pragma vector aligned #endif for (int i = iifrom; i < iito; i++) { @@ -876,7 +880,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; @@ -885,7 +893,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3; @@ -917,8 +929,12 @@ void PPPMDispIntel::make_rho_c(IntelBuffers * /*buffers*/) int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mzyx = l + mzy; @@ -939,7 +955,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers * /*buffers*/) IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr); #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { for (int j = 1; j < nthr; j++) { @@ -1025,7 +1045,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + 
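// pair_lj_cut_intel.cpp above is the one hunk in this excerpt where the OpenMP
// branch drops "#pragma vector aligned" entirely and instead names the aligned
// pointers in an "aligned(...:64)" clause on the simd directive.  Sketch of the
// clause on its own (hypothetical arrays; 64 bytes matches the cache-line
// alignment the INTEL package uses for its buffers):
void axpy(double *__restrict f, const double *__restrict x, int n, double a)
{
#if defined(USE_OMP_SIMD)
  #pragma omp simd aligned(f, x : 64)
#endif
  for (int i = 0; i < n; i++)
    f[i] += a * x[i];             // both arrays promised 64-byte aligned to the vectorizer
}
// Callers must keep that promise (e.g. allocate with _mm_malloc or
// posix_memalign); handing the loop an unaligned pointer is undefined behavior
// once the clause is present.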
#pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -1034,7 +1058,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3; @@ -1067,8 +1095,12 @@ void PPPMDispIntel::make_rho_g(IntelBuffers * /*buffers*/) int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mzyx = l + mzy; @@ -1089,7 +1121,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers * /*buffers*/) IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr); #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { for (int j = 1; j < nthr; j++) { @@ -1173,7 +1209,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -1182,7 +1222,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3; @@ -1215,8 +1259,12 @@ void PPPMDispIntel::make_rho_a(IntelBuffers * /*buffers*/) int my = m + nysum; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -1307,7 +1355,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -1316,7 +1368,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3; @@ -1349,8 +1405,12 @@ void PPPMDispIntel::make_rho_none(IntelBuffers * /*buffers*/) int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mzyx = l + mzy; @@ -1373,7 +1433,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers * /*buffers*/) IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr); #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < 
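// In the PPPM charge-spreading hunks the Intel-only trip-count hint
// "#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)" is kept
// and simply moved after the new switch, so it still reaches the Intel compiler
// in both branches; other compilers ignore (or warn about) the unknown pragma.
// Sketch with a hypothetical stencil width of 8:
void spread_row(double *__restrict grid, const double *__restrict w,
                double x0, int order)
{
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
  #pragma omp simd
#else
  #pragma simd
#endif
  #pragma loop_count min(2), max(8), avg(7)   // Intel hint: order is usually 5-7
#endif
  for (int l = 0; l < order; l++)
    grid[l] += x0 * w[l];                     // weighted charge added to the stencil row
}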
ito; i++) { for (int j = 1; j < nthr; j++) { @@ -1454,7 +1518,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho_lookup[idx][k]; @@ -1463,7 +1531,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1 = rho_coeff[order-1][k]; @@ -1498,8 +1570,12 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers * /*buffers*/) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l+nxsum; @@ -1624,7 +1700,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers * /*buffers*/) int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; @@ -1636,7 +1716,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -1680,8 +1764,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers * /*buffers*/) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -1702,7 +1790,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers * /*buffers*/) } } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { particle_ekx[i] *= hx_inv; @@ -1802,7 +1894,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho6_lookup[idx][k]; @@ -1811,7 +1907,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; @@ -1846,8 +1946,12 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers * /*buffers*/) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l+nxsum; @@ -1967,7 +2071,11 @@ void 
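// The fieldforce_* lookup branches above vectorize a short, fixed-length copy
// out of a precomputed weight table; because the table rows are padded to
// INTEL_P3M_ALIGNED_MAXORDER, the loop has a known trip count and no
// dependencies, so a bare "omp simd" is sufficient.  Sketch (hypothetical
// padded width of 8 standing in for INTEL_P3M_ALIGNED_MAXORDER):
constexpr int kPaddedOrder = 8;

void load_weights(double rho0[kPaddedOrder],
                  const double table[][kPaddedOrder], int idx)
{
#if defined(USE_OMP_SIMD)
  #pragma omp simd
#endif
  for (int k = 0; k < kPaddedOrder; k++)
    rho0[k] = table[idx][k];      // unit-stride row copy from the lookup table
}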
PPPMDispIntel::fieldforce_g_ad(IntelBuffers * /*buffers*/) int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -1979,7 +2087,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -2023,8 +2135,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers * /*buffers*/) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -2045,7 +2161,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers * /*buffers*/) } } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { particle_ekx[i] *= hx_inv; @@ -2143,7 +2263,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho6_lookup[idx][k]; @@ -2152,7 +2276,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; @@ -2206,8 +2334,12 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers * /*buffers*/) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l+nxsum; @@ -2398,7 +2530,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers * /*buffers*/) int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -2410,7 +2546,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -2479,8 +2619,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers * /*buffers*/) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -2541,7 +2685,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers * /*buffers*/) } } #if 
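// After the stencil gather, the *_ad (analytic differentiation) variants scale
// the accumulated per-particle field by the inverse grid spacing; that tail
// loop is switched to "omp simd" as well.  It is a plain elementwise update, so
// no reduction clause is needed.  Sketch with hypothetical array names:
void scale_fields(double *__restrict ekx, double *__restrict eky,
                  double *__restrict ekz, int ifrom, int ito,
                  double hx_inv, double hy_inv, double hz_inv)
{
#if defined(USE_OMP_SIMD)
  #pragma omp simd
#endif
  for (int i = ifrom; i < ito; i++) {
    ekx[i] *= hx_inv;             // grid-space gradient -> real-space field, x
    eky[i] *= hy_inv;             // y
    ekz[i] *= hz_inv;             // z
  }
}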
defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { particle_ekx0[i] *= hx_inv; @@ -2671,7 +2819,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho6_lookup[idx][k]; @@ -2680,7 +2832,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; @@ -2721,8 +2877,12 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers * /*buffers*/) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l+nxsum; @@ -2848,7 +3008,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers * /*buffers*/) int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -2860,7 +3024,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -2909,8 +3077,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers * /*buffers*/) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -2992,7 +3164,11 @@ void PPPMDispIntel::precompute_rho() for (int i = 0; i < rho_points; i++) { FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower; k<=nupper;k++) { FFT_SCALAR r1 = ZEROF; @@ -3006,7 +3182,11 @@ void PPPMDispIntel::precompute_rho() } if (differentiation_flag == 1) { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower; k<=nupper;k++) { FFT_SCALAR r1 = ZEROF; @@ -3026,7 +3206,11 @@ void PPPMDispIntel::precompute_rho() for (int i = 0; i < rho_points; i++) { FFT_SCALAR dx = -1. 
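// precompute_rho() tabulates the charge-assignment weights by evaluating the
// rho_coeff polynomials at sampled fractional offsets, and those inner loops
// now take the same "omp simd" switch.  Sketch of one table row using Horner's
// rule (hypothetical coefficient layout; the real rho_coeff indexing differs):
void tabulate_row(double *row, const double coeff[][16], double dx,
                  int nlower, int nupper, int order)
{
#if defined(USE_OMP_SIMD)
  #pragma omp simd
#endif
  for (int k = nlower; k <= nupper; k++) {
    double r1 = 0.0;
    for (int l = order - 1; l >= 0; l--)
      r1 = coeff[l][k - nlower] + r1 * dx;   // Horner evaluation of weight k at offset dx
    row[k - nlower] = r1;
  }
}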
+ 1./half_rho_scale * (FFT_SCALAR)i; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower_6; k<=nupper_6;k++) { FFT_SCALAR r1 = ZEROF; @@ -3040,7 +3224,11 @@ void PPPMDispIntel::precompute_rho() } if (differentiation_flag == 1) { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower_6; k<=nupper_6;k++) { FFT_SCALAR r1 = ZEROF; diff --git a/src/INTEL/pppm_intel.cpp b/src/INTEL/pppm_intel.cpp index 8b0542d770..8041709ebc 100644 --- a/src/INTEL/pppm_intel.cpp +++ b/src/INTEL/pppm_intel.cpp @@ -394,8 +394,12 @@ void PPPMIntel::particle_map(IntelBuffers *buffers) IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T)); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:flag) +#else #pragma simd reduction(+:flag) +#endif + #pragma vector aligned #endif for (int i = iifrom; i < iito; i++) { @@ -500,7 +504,11 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; @@ -509,7 +517,11 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3; @@ -541,7 +553,11 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { int mzyx = l + mzy; @@ -563,7 +579,11 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr); #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { for (int j = 1; j < nthr; j++) { @@ -645,7 +665,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho_lookup[idx][k]; @@ -654,7 +678,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1 = rho_coeff[order-1][k]; @@ -690,7 +718,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { int mx = l+nxsum; @@ -813,7 +845,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; @@ -825,7 +861,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) } } else { 
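// PPPMIntel::particle_map() above flags atoms whose grid cell falls outside the
// owned-plus-ghost brick, which is why both the old and new directives carry a
// "reduction(+:flag)" clause.  Sketch of the idea with hypothetical bounds:
int count_out_of_range(const double *__restrict x, int n,
                       double lo, double inv_h, int nlo, int nhi)
{
  int flag = 0;
#if defined(USE_OMP_SIMD)
  #pragma omp simd reduction(+:flag)
#endif
  for (int i = 0; i < n; i++) {
    const int cell = static_cast<int>((x[i] - lo) * inv_h);
    if (cell < nlo || cell > nhi) flag++;    // each lane counts its own violations
  }
  return flag;
}
// A nonzero return is then turned into an out-of-range-atoms error by the caller.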
#if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -871,7 +911,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { int mx = l + nxsum; @@ -893,7 +937,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { particle_ekx[i] *= hx_inv; @@ -942,7 +990,11 @@ void PPPMIntel::precompute_rho() for (int i = 0; i < rho_points; i++) { FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower; k<=nupper;k++) { FFT_SCALAR r1 = ZEROF; @@ -956,7 +1008,11 @@ void PPPMIntel::precompute_rho() } if (differentiation_flag == 1) { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower; k<=nupper;k++) { FFT_SCALAR r1 = ZEROF;
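// "#pragma omp simd" requires OpenMP 4.0 or newer and is only honored when the
// compiler is invoked with an OpenMP (or OpenMP-SIMD-only) flag, e.g.
// -qopenmp / -qopenmp-simd for the Intel compilers or -fopenmp / -fopenmp-simd
// for GCC and Clang.  One plausible way to gate the new path at compile time,
// shown purely as an assumption for illustration and not the actual logic this
// patch adds to intel_preprocess.h:
#if defined(_OPENMP) && (_OPENMP >= 201307)   // 201307 == OpenMP 4.0
#define USE_OMP_SIMD
#endif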