Merge pull request #3410 from wmbrownIntel/icx_gather_opt

Changes to Intel Package files for better LLVM-based compiler support
2022-08-23 11:00:44 -04:00
parent 66bbfa67dc f7cf5b6751
commit 48732ff376
25 changed files with 83 additions and 68 deletions
--- a/src/INTEL/angle_charmm_intel.cpp
+++ b/src/INTEL/angle_charmm_intel.cpp
@ -166,10 +166,10 @@ void AngleCharmmIntel::eval(const int vflag,
    #else
    for (int n = nfrom; n < nto; n += npl) {
    #endif
-      const int i1 = anglelist[n].a;
-      const int i2 = anglelist[n].b;
-      const int i3 = anglelist[n].c;
-      const int type = anglelist[n].t;
+      const int i1 = IP_PRE_dword_index(anglelist[n].a);
+      const int i2 = IP_PRE_dword_index(anglelist[n].b);
+      const int i3 = IP_PRE_dword_index(anglelist[n].c);
+      const int type = IP_PRE_dword_index(anglelist[n].t);

      // 1st bond

--- a/src/INTEL/angle_harmonic_intel.cpp
+++ b/src/INTEL/angle_harmonic_intel.cpp
@ -166,10 +166,10 @@ void AngleHarmonicIntel::eval(const int vflag,
    #else
    for (int n = nfrom; n < nto; n += npl) {
    #endif
-      const int i1 = anglelist[n].a;
-      const int i2 = anglelist[n].b;
-      const int i3 = anglelist[n].c;
-      const int type = anglelist[n].t;
+      const int i1 = IP_PRE_dword_index(anglelist[n].a);
+      const int i2 = IP_PRE_dword_index(anglelist[n].b);
+      const int i3 = IP_PRE_dword_index(anglelist[n].c);
+      const int type = IP_PRE_dword_index(anglelist[n].t);

      // 1st bond

--- a/src/INTEL/bond_fene_intel.cpp
+++ b/src/INTEL/bond_fene_intel.cpp
@ -163,9 +163,9 @@ void BondFENEIntel::eval(const int vflag,
    #else
    for (int n = nfrom; n < nto; n += npl) {
    #endif
-      const int i1 = bondlist[n].a;
-      const int i2 = bondlist[n].b;
-      const int type = bondlist[n].t;
+      const int i1 = IP_PRE_dword_index(bondlist[n].a);
+      const int i2 = IP_PRE_dword_index(bondlist[n].b);
+      const int type = IP_PRE_dword_index(bondlist[n].t);

      const flt_t ir0sq = fc.fc[type].ir0sq;
      const flt_t k = fc.fc[type].k;
--- a/src/INTEL/bond_harmonic_intel.cpp
+++ b/src/INTEL/bond_harmonic_intel.cpp
@ -159,9 +159,9 @@ void BondHarmonicIntel::eval(const int vflag,
    #else
    for (int n = nfrom; n < nto; n += npl) {
    #endif
-      const int i1 = bondlist[n].a;
-      const int i2 = bondlist[n].b;
-      const int type = bondlist[n].t;
+      const int i1 = IP_PRE_dword_index(bondlist[n].a);
+      const int i2 = IP_PRE_dword_index(bondlist[n].b);
+      const int type = IP_PRE_dword_index(bondlist[n].t);

      const flt_t delx = x[i1].x - x[i2].x;
      const flt_t dely = x[i1].y - x[i2].y;
--- a/src/INTEL/dihedral_charmm_intel.cpp
+++ b/src/INTEL/dihedral_charmm_intel.cpp
@ -195,11 +195,11 @@ void DihedralCharmmIntel::eval(const int vflag,
    for (int n = nfrom; n < nto; n++) {
    #endif
    for (int n = nfrom; n < nto; n += npl) {
-      const int i1 = dihedrallist[n].a;
-      const int i2 = dihedrallist[n].b;
-      const int i3 = dihedrallist[n].c;
-      const int i4 = dihedrallist[n].d;
-      const int type = dihedrallist[n].t;
+      const int i1 = IP_PRE_dword_index(dihedrallist[n].a);
+      const int i2 = IP_PRE_dword_index(dihedrallist[n].b);
+      const int i3 = IP_PRE_dword_index(dihedrallist[n].c);
+      const int i4 = IP_PRE_dword_index(dihedrallist[n].d);
+      const int type = IP_PRE_dword_index(dihedrallist[n].t);

      // 1st bond

--- a/src/INTEL/dihedral_harmonic_intel.cpp
+++ b/src/INTEL/dihedral_harmonic_intel.cpp
@ -163,11 +163,11 @@ void DihedralHarmonicIntel::eval(const int vflag,
    #else
    for (int n = nfrom; n < nto; n += npl) {
    #endif
-      const int i1 = dihedrallist[n].a;
-      const int i2 = dihedrallist[n].b;
-      const int i3 = dihedrallist[n].c;
-      const int i4 = dihedrallist[n].d;
-      const int type = dihedrallist[n].t;
+      const int i1 = IP_PRE_dword_index(dihedrallist[n].a);
+      const int i2 = IP_PRE_dword_index(dihedrallist[n].b);
+      const int i3 = IP_PRE_dword_index(dihedrallist[n].c);
+      const int i4 = IP_PRE_dword_index(dihedrallist[n].d);
+      const int type = IP_PRE_dword_index(dihedrallist[n].t);

      // 1st bond

--- a/src/INTEL/dihedral_opls_intel.cpp
+++ b/src/INTEL/dihedral_opls_intel.cpp
@ -167,11 +167,11 @@ void DihedralOPLSIntel::eval(const int vflag,
    #else
    for (int n = nfrom; n < nto; n += npl) {
    #endif
-      const int i1 = dihedrallist[n].a;
-      const int i2 = dihedrallist[n].b;
-      const int i3 = dihedrallist[n].c;
-      const int i4 = dihedrallist[n].d;
-      const int type = dihedrallist[n].t;
+      const int i1 = IP_PRE_dword_index(dihedrallist[n].a);
+      const int i2 = IP_PRE_dword_index(dihedrallist[n].b);
+      const int i3 = IP_PRE_dword_index(dihedrallist[n].c);
+      const int i4 = IP_PRE_dword_index(dihedrallist[n].d);
+      const int type = IP_PRE_dword_index(dihedrallist[n].t);

      // 1st bond

--- a/src/INTEL/fix_nh_intel.cpp
+++ b/src/INTEL/fix_nh_intel.cpp
@ -22,6 +22,7 @@
 #include "domain.h"
 #include "error.h"
 #include "force.h"
+#include "intel_preprocess.h"
 #include "memory.h"
 #include "modify.h"
 #include "neighbor.h"
@ -100,6 +101,7 @@ void FixNHIntel::remap()
    #pragma vector aligned
    #endif
    for (int i = 0; i < nlocal; i++) {
+      i = IP_PRE_dword_index(i);
      const double d0 = x[i].x - b0;
      const double d1 = x[i].y - b1;
      const double d2 = x[i].z - b2;
@ -118,6 +120,7 @@ void FixNHIntel::remap()
    #endif
    for (int i = 0; i < nlocal; i++) {
      if (mask[i] & dilate_group_bit) {
+        i = IP_PRE_dword_index(i);
        const double d0 = x[i].x - b0;
        const double d1 = x[i].y - b1;
        const double d2 = x[i].z - b2;
@ -287,6 +290,7 @@ void FixNHIntel::remap()
    #pragma vector aligned
    #endif
    for (int i = 0; i < nlocal; i++) {
+      i = IP_PRE_dword_index(i);
      x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
      x[i].y = h1*x[i].y + h3*x[i].z + nb1;
      x[i].z = h2*x[i].z + nb2;
@ -302,6 +306,7 @@ void FixNHIntel::remap()
    #endif
    for (int i = 0; i < nlocal; i++) {
      if (mask[i] & dilate_group_bit) {
+        i = IP_PRE_dword_index(i);
        x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
        x[i].y = h1*x[i].y + h3*x[i].z + nb1;
        x[i].z = h2*x[i].z + nb2;
@ -432,6 +437,7 @@ void FixNHIntel::nh_v_press()
    #pragma vector aligned
    #endif
    for (int i = 0; i < nlocal; i++) {
+      i = IP_PRE_dword_index(i);
      v[i].x *= f0;
      v[i].y *= f1;
      v[i].z *= f2;
@ -447,6 +453,7 @@ void FixNHIntel::nh_v_press()
    #endif
    for (int i = 0; i < nlocal; i++) {
      if (mask[i] & groupbit) {
+        i = IP_PRE_dword_index(i);
        v[i].x *= f0;
        v[i].y *= f1;
        v[i].z *= f2;
--- a/src/INTEL/improper_cvff_intel.cpp
+++ b/src/INTEL/improper_cvff_intel.cpp
@ -168,11 +168,11 @@ void ImproperCvffIntel::eval(const int vflag,
    #else
    for (int n = nfrom; n < nto; n += npl) {
    #endif
-      const int i1 = improperlist[n].a;
-      const int i2 = improperlist[n].b;
-      const int i3 = improperlist[n].c;
-      const int i4 = improperlist[n].d;
-      const int type = improperlist[n].t;
+      const int i1 = IP_PRE_dword_index(improperlist[n].a);
+      const int i2 = IP_PRE_dword_index(improperlist[n].b);
+      const int i3 = IP_PRE_dword_index(improperlist[n].c);
+      const int i4 = IP_PRE_dword_index(improperlist[n].d);
+      const int type = IP_PRE_dword_index(improperlist[n].t);

      // geometry of 4-body

--- a/src/INTEL/improper_harmonic_intel.cpp
+++ b/src/INTEL/improper_harmonic_intel.cpp
@ -170,11 +170,11 @@ void ImproperHarmonicIntel::eval(const int vflag,
    #else
    for (int n = nfrom; n < nto; n += npl) {
    #endif
-      const int i1 = improperlist[n].a;
-      const int i2 = improperlist[n].b;
-      const int i3 = improperlist[n].c;
-      const int i4 = improperlist[n].d;
-      const int type = improperlist[n].t;
+      const int i1 = IP_PRE_dword_index(improperlist[n].a);
+      const int i2 = IP_PRE_dword_index(improperlist[n].b);
+      const int i3 = IP_PRE_dword_index(improperlist[n].c);
+      const int i4 = IP_PRE_dword_index(improperlist[n].d);
+      const int type = IP_PRE_dword_index(improperlist[n].t);

      // geometry of 4-body

--- a/src/INTEL/intel_preprocess.h
+++ b/src/INTEL/intel_preprocess.h
@ -16,10 +16,16 @@
   Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */

+#include "lmptype.h"
+
 #ifdef __INTEL_LLVM_COMPILER
 #define USE_OMP_SIMD
 #define __INTEL_COMPILER __INTEL_LLVM_COMPILER
 #define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
+// Masking the index indicates to the vectorizer that a dword indexed gather is safe
+#define IP_PRE_dword_index(i) ((i) & NEIGHMASK)
+#else  // other compilers: identity, parenthesized so expression arguments keep their grouping
+#define IP_PRE_dword_index(i) (i)
 #endif

 #ifdef __INTEL_COMPILER
--- a/src/INTEL/npair_full_bin_ghost_intel.cpp
+++ b/src/INTEL/npair_full_bin_ghost_intel.cpp
@ -370,7 +370,7 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
          #pragma vector aligned
          #endif
          for (int u = 0; u < ncount; u++) {
-            const int j = tj[u];
+            const int j = IP_PRE_dword_index(tj[u]);
            tx[u] = x[j].x;
            ty[u] = x[j].y;
            tz[u] = x[j].z;
--- a/src/INTEL/npair_intel.cpp
+++ b/src/INTEL/npair_intel.cpp
@ -359,7 +359,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
          #pragma vector aligned
          #endif
          for (int u = 0; u < ncount; u++) {
-            const int j = tj[u];
+            const int j = IP_PRE_dword_index(tj[u]);
            tx[u] = x[j].x;
            ty[u] = x[j].y;
            tz[u] = x[j].z;
@ -387,7 +387,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
 #endif
            #endif
            for (int jj = bstart; jj < bend; jj++) {
-              const int j = binpacked[jj];
+              const int j = IP_PRE_dword_index(binpacked[jj]);
              itj[icount] = j;
              itx[icount] = x[j].x;
              ity[icount] = x[j].y;
--- a/src/INTEL/pair_buck_coul_cut_intel.cpp
+++ b/src/INTEL/pair_buck_coul_cut_intel.cpp
@ -265,7 +265,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
          const flt_t delx = xtmp - x[j].x;
          const flt_t dely = ytmp - x[j].y;
          const flt_t delz = ztmp - x[j].z;
-          const int jtype = x[j].w;
+          const int jtype = IP_PRE_dword_index(x[j].w);
          const flt_t rsq = delx * delx + dely * dely + delz * delz;
          const flt_t r = sqrt(rsq);
          const flt_t r2inv = (flt_t)1.0 / rsq;
--- a/src/INTEL/pair_buck_coul_long_intel.cpp
+++ b/src/INTEL/pair_buck_coul_long_intel.cpp
@ -289,7 +289,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
          const flt_t delx = xtmp - x[j].x;
          const flt_t dely = ytmp - x[j].y;
          const flt_t delz = ztmp - x[j].z;
-          const int jtype = x[j].w;
+          const int jtype = IP_PRE_dword_index(x[j].w);
          const flt_t rsq = delx * delx + dely * dely + delz * delz;

          if (rsq < c_forcei[jtype].cutsq) {
--- a/src/INTEL/pair_buck_intel.cpp
+++ b/src/INTEL/pair_buck_intel.cpp
@ -253,7 +253,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
          const flt_t delx = xtmp - x[j].x;
          const flt_t dely = ytmp - x[j].y;
          const flt_t delz = ztmp - x[j].z;
-          const int jtype = x[j].w;
+          const int jtype = IP_PRE_dword_index(x[j].w);
          const flt_t rsq = delx * delx + dely * dely + delz * delz;
          const flt_t r = sqrt(rsq);
          const flt_t r2inv = (flt_t)1.0 / rsq;
--- a/src/INTEL/pair_dpd_intel.cpp
+++ b/src/INTEL/pair_dpd_intel.cpp
@ -312,13 +312,13 @@ void PairDPDIntel::eval(const int offload, const int vflag,
            sbindex = jlist[jj] >> SBBITS & 3;
            j = jlist[jj] & NEIGHMASK;
          } else
-            j = jlist[jj];
+            j = IP_PRE_dword_index(jlist[jj]);

          const flt_t delx = xtmp - x[j].x;
          const flt_t dely = ytmp - x[j].y;
          const flt_t delz = ztmp - x[j].z;
          if (!ONETYPE) {
-            jtype = x[j].w;
+            jtype = IP_PRE_dword_index(x[j].w);
            icut = parami[jtype].icut;
          }
          const flt_t rsq = delx * delx + dely * dely + delz * delz;
--- a/src/INTEL/pair_eam_intel.cpp
+++ b/src/INTEL/pair_eam_intel.cpp
@ -347,14 +347,15 @@ void PairEAMIntel::eval(const int offload, const int vflag,
          p = MIN(p,(flt_t)1.0);
          if (!ONETYPE)
            rhor_joff = rhor_ioff + jtype * jstride;
-          const int joff = rhor_joff + m;
+          const int joff = IP_PRE_dword_index(rhor_joff + m);
          flt_t ra;
          ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
                rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
          rhoi += ra;
          if (NEWTON_PAIR) {
            if (!ONETYPE) {
-              const int ioff = jtype * istride + itype * jstride + m;
+              const int ioff = IP_PRE_dword_index(jtype * istride + itype *
+                                                  jstride + m);
              ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
                    rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
            }
@ -439,7 +440,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
      #pragma vector aligned
      #endif
      for (int ii = iifrom; ii < iito; ++ii) {
-        const int i = ilist[ii];
+        const int i = IP_PRE_dword_index(ilist[ii]);
        int itype;
        if (!ONETYPE) itype = x[i].w;
        flt_t p = rho[i]*frdrho + (flt_t)1.0;
@ -448,7 +449,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
        p -= m;
        p = MIN(p,(flt_t)1.0);
        if (!ONETYPE) frho_ioff = itype * fstride;
-        const int ioff = frho_ioff + m;
+        const int ioff = IP_PRE_dword_index(frho_ioff + m);
        fp_f[i] = (frho_spline_f[ioff].a*p + frho_spline_f[ioff].b)*p +
          frho_spline_f[ioff].c;
        if (EFLAG) {
@ -553,13 +554,14 @@ void PairEAMIntel::eval(const int offload, const int vflag,
          p = MIN(p,(flt_t)1.0);
          if (!ONETYPE)
            rhor_joff = rhor_ioff + jtype * jstride;
-          const int joff = rhor_joff + m;
+          const int joff = IP_PRE_dword_index(rhor_joff + m);
          const flt_t rhojp = (rhor_spline_f[joff].a*p +
                               rhor_spline_f[joff].b)*p +
            rhor_spline_f[joff].c;
          flt_t rhoip;
          if (!ONETYPE) {
-            const int ioff = jtype * istride + itype * jstride + m;
+            const int ioff = IP_PRE_dword_index(jtype * istride +
+                                                itype * jstride + m);
            rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p +
              rhor_spline_f[ioff].c;
          } else
--- a/src/INTEL/pair_gayberne_intel.cpp
+++ b/src/INTEL/pair_gayberne_intel.cpp
@ -417,7 +417,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
        for (int jj = 0; jj < jnum; jj++) {
          int jm = jlist[jj];
          int j = jm & NEIGHMASK;
-          const int jtype = x[j].w;
+          const int jtype = IP_PRE_dword_index(x[j].w);

          if (ijci[jtype].form == ELLIPSE_ELLIPSE) {
            flt_t delx = x[j].x-xtmp;
@ -473,7 +473,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
          const int sbindex = jlist_form[jj] >> SBBITS & 3;
          const int j = jlist_form[jj] & NEIGHMASK;
          flt_t factor_lj = special_lj[sbindex];
-          const int jtype = jtype_form[jj];
+          const int jtype = IP_PRE_dword_index(jtype_form[jj]);
          const flt_t sigma = ijci[jtype].sigma;
          const flt_t epsilon = ijci[jtype].epsilon;
          const flt_t shape2_0 = ic[jtype].shape2[0];
--- a/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp
+++ b/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp
@ -318,7 +318,7 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
          #ifdef INTEL_VMASK
          if (rsq < cut_ljsq) {
          #endif
-            const int jtype = tjtype[jj];
+            const int jtype = IP_PRE_dword_index(tjtype[jj]);
            flt_t r6inv = r2inv * r2inv * r2inv;
            forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
            if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
--- a/src/INTEL/pair_lj_charmm_coul_long_intel.cpp
+++ b/src/INTEL/pair_lj_charmm_coul_long_intel.cpp
@ -324,7 +324,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,

          const int j = tj[jj] & NEIGHMASK;
          const int sbindex = tj[jj] >> SBBITS & 3;
-          const int jtype = tjtype[jj];
+          const int jtype = IP_PRE_dword_index(tjtype[jj]);
          const flt_t rsq = trsq[jj];
          const flt_t r2inv = (flt_t)1.0 / rsq;

--- a/src/INTEL/pair_lj_cut_coul_long_intel.cpp
+++ b/src/INTEL/pair_lj_cut_coul_long_intel.cpp
@ -287,7 +287,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
          const flt_t delx = xtmp - x[j].x;
          const flt_t dely = ytmp - x[j].y;
          const flt_t delz = ztmp - x[j].z;
-          const int jtype = x[j].w;
+          const int jtype = IP_PRE_dword_index(x[j].w);
          const flt_t rsq = delx * delx + dely * dely + delz * delz;

          if (rsq < c_forcei[jtype].cutsq) {
@ -316,8 +316,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
          forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;

          const int j = tj[jj] & NEIGHMASK;
-          const int sbindex = tj[jj] >> SBBITS & 3;
-          const int jtype = tjtype[jj];
+          const int sbindex = IP_PRE_dword_index(tj[jj] >> SBBITS & 3);
+          const int jtype = IP_PRE_dword_index(tjtype[jj]);
          const flt_t rsq = trsq[jj];
          const flt_t r2inv = (flt_t)1.0 / rsq;

--- a/src/INTEL/pair_lj_cut_intel.cpp
+++ b/src/INTEL/pair_lj_cut_intel.cpp
@ -262,13 +262,13 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
            sbindex = jlist[jj] >> SBBITS & 3;
            j = jlist[jj] & NEIGHMASK;
          } else
-            j = jlist[jj];
+            j = IP_PRE_dword_index(jlist[jj]);

          const flt_t delx = xtmp - x[j].x;
          const flt_t dely = ytmp - x[j].y;
          const flt_t delz = ztmp - x[j].z;
          if (!ONETYPE) {
-            jtype = x[j].w;
+            jtype = IP_PRE_dword_index(x[j].w);
            cutsq = ljc12oi[jtype].cutsq;
          }
          const flt_t rsq = delx * delx + dely * dely + delz * delz;
--- a/src/INTEL/pair_sw_intel.cpp
+++ b/src/INTEL/pair_sw_intel.cpp
@ -332,7 +332,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
          int jtype, ijtype;
          if (!ONETYPE) {
            jtype = x[j].w;
-            ijtype = itype_offset + jtype;
+            ijtype = IP_PRE_dword_index(itype_offset + jtype);
            cutsq = p2[ijtype].cutsq;
          }
          const flt_t rsq1 = delx * delx + dely * dely + delz * delz;
@ -378,7 +378,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
          if (EFLAG) fjtmp = (acc_t)0.0;
          int ijtype;

-          if (!ONETYPE) ijtype = tjtype[jj] + itype_offset;
+          if (!ONETYPE) ijtype = IP_PRE_dword_index(tjtype[jj] + itype_offset);
          const flt_t rsq1 = trsq[jj];

          const flt_t rinvsq1 = (flt_t)1.0 / rsq1;
@ -459,8 +459,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
            int iktype, ijktype;
            if (!ONETYPE) {
              iktype = tjtype[kk];
-              ijktype = ijkoff + iktype;
-              iktype += itype_offset;
+              ijktype = IP_PRE_dword_index(ijkoff + iktype);
+              iktype = IP_PRE_dword_index(iktype + itype_offset);
              cut = p2[iktype].cut;
              sigma_gamma = p2[iktype].sigma_gamma;
              costheta = p3[ijktype].costheta;
@ -520,7 +520,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
              }
            }
          } // for kk
-          const int j = tj[jj];
+          const int j = IP_PRE_dword_index(tj[jj]);
          f[j].x += fjxtmp;
          f[j].y += fjytmp;
          f[j].z += fjztmp;
--- a/src/INTEL/pppm_intel.cpp
+++ b/src/INTEL/pppm_intel.cpp
@ -403,7 +403,6 @@ void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers)
      // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
      // current particle coord can be outside global and local box
      // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
-
      int nx = static_cast<int> ((x[i].x-lo0)*xi+fshift) - OFFSET;
      int ny = static_cast<int> ((x[i].y-lo1)*yi+fshift) - OFFSET;
      int nz = static_cast<int> ((x[i].z-lo2)*zi+fshift) - OFFSET;
@ -941,6 +940,7 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
 #endif
    #endif
    for (int i = ifrom; i < ito; i++) {
+      i = IP_PRE_dword_index(i);
      particle_ekx[i] *= hx_inv;
      particle_eky[i] *= hy_inv;
      particle_ekz[i] *= hz_inv;