Merge pull request #3410 from wmbrownIntel/icx_gather_opt

Changes to Intel Package files for better LLVM-based compiler support
This commit is contained in:
Axel Kohlmeyer
2022-08-23 11:00:44 -04:00
committed by GitHub
25 changed files with 83 additions and 68 deletions

View File

@ -166,10 +166,10 @@ void AngleCharmmIntel::eval(const int vflag,
#else
for (int n = nfrom; n < nto; n += npl) {
#endif
const int i1 = anglelist[n].a;
const int i2 = anglelist[n].b;
const int i3 = anglelist[n].c;
const int type = anglelist[n].t;
const int i1 = IP_PRE_dword_index(anglelist[n].a);
const int i2 = IP_PRE_dword_index(anglelist[n].b);
const int i3 = IP_PRE_dword_index(anglelist[n].c);
const int type = IP_PRE_dword_index(anglelist[n].t);
// 1st bond

View File

@ -166,10 +166,10 @@ void AngleHarmonicIntel::eval(const int vflag,
#else
for (int n = nfrom; n < nto; n += npl) {
#endif
const int i1 = anglelist[n].a;
const int i2 = anglelist[n].b;
const int i3 = anglelist[n].c;
const int type = anglelist[n].t;
const int i1 = IP_PRE_dword_index(anglelist[n].a);
const int i2 = IP_PRE_dword_index(anglelist[n].b);
const int i3 = IP_PRE_dword_index(anglelist[n].c);
const int type = IP_PRE_dword_index(anglelist[n].t);
// 1st bond

View File

@ -163,9 +163,9 @@ void BondFENEIntel::eval(const int vflag,
#else
for (int n = nfrom; n < nto; n += npl) {
#endif
const int i1 = bondlist[n].a;
const int i2 = bondlist[n].b;
const int type = bondlist[n].t;
const int i1 = IP_PRE_dword_index(bondlist[n].a);
const int i2 = IP_PRE_dword_index(bondlist[n].b);
const int type = IP_PRE_dword_index(bondlist[n].t);
const flt_t ir0sq = fc.fc[type].ir0sq;
const flt_t k = fc.fc[type].k;

View File

@ -159,9 +159,9 @@ void BondHarmonicIntel::eval(const int vflag,
#else
for (int n = nfrom; n < nto; n += npl) {
#endif
const int i1 = bondlist[n].a;
const int i2 = bondlist[n].b;
const int type = bondlist[n].t;
const int i1 = IP_PRE_dword_index(bondlist[n].a);
const int i2 = IP_PRE_dword_index(bondlist[n].b);
const int type = IP_PRE_dword_index(bondlist[n].t);
const flt_t delx = x[i1].x - x[i2].x;
const flt_t dely = x[i1].y - x[i2].y;

View File

@ -195,11 +195,11 @@ void DihedralCharmmIntel::eval(const int vflag,
for (int n = nfrom; n < nto; n++) {
#endif
for (int n = nfrom; n < nto; n += npl) {
const int i1 = dihedrallist[n].a;
const int i2 = dihedrallist[n].b;
const int i3 = dihedrallist[n].c;
const int i4 = dihedrallist[n].d;
const int type = dihedrallist[n].t;
const int i1 = IP_PRE_dword_index(dihedrallist[n].a);
const int i2 = IP_PRE_dword_index(dihedrallist[n].b);
const int i3 = IP_PRE_dword_index(dihedrallist[n].c);
const int i4 = IP_PRE_dword_index(dihedrallist[n].d);
const int type = IP_PRE_dword_index(dihedrallist[n].t);
// 1st bond

View File

@ -163,11 +163,11 @@ void DihedralHarmonicIntel::eval(const int vflag,
#else
for (int n = nfrom; n < nto; n += npl) {
#endif
const int i1 = dihedrallist[n].a;
const int i2 = dihedrallist[n].b;
const int i3 = dihedrallist[n].c;
const int i4 = dihedrallist[n].d;
const int type = dihedrallist[n].t;
const int i1 = IP_PRE_dword_index(dihedrallist[n].a);
const int i2 = IP_PRE_dword_index(dihedrallist[n].b);
const int i3 = IP_PRE_dword_index(dihedrallist[n].c);
const int i4 = IP_PRE_dword_index(dihedrallist[n].d);
const int type = IP_PRE_dword_index(dihedrallist[n].t);
// 1st bond

View File

@ -167,11 +167,11 @@ void DihedralOPLSIntel::eval(const int vflag,
#else
for (int n = nfrom; n < nto; n += npl) {
#endif
const int i1 = dihedrallist[n].a;
const int i2 = dihedrallist[n].b;
const int i3 = dihedrallist[n].c;
const int i4 = dihedrallist[n].d;
const int type = dihedrallist[n].t;
const int i1 = IP_PRE_dword_index(dihedrallist[n].a);
const int i2 = IP_PRE_dword_index(dihedrallist[n].b);
const int i3 = IP_PRE_dword_index(dihedrallist[n].c);
const int i4 = IP_PRE_dword_index(dihedrallist[n].d);
const int type = IP_PRE_dword_index(dihedrallist[n].t);
// 1st bond

View File

@ -22,6 +22,7 @@
#include "domain.h"
#include "error.h"
#include "force.h"
#include "intel_preprocess.h"
#include "memory.h"
#include "modify.h"
#include "neighbor.h"
@ -100,6 +101,7 @@ void FixNHIntel::remap()
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
i = IP_PRE_dword_index(i);
const double d0 = x[i].x - b0;
const double d1 = x[i].y - b1;
const double d2 = x[i].z - b2;
@ -118,6 +120,7 @@ void FixNHIntel::remap()
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & dilate_group_bit) {
i = IP_PRE_dword_index(i);
const double d0 = x[i].x - b0;
const double d1 = x[i].y - b1;
const double d2 = x[i].z - b2;
@ -287,6 +290,7 @@ void FixNHIntel::remap()
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
i = IP_PRE_dword_index(i);
x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
x[i].y = h1*x[i].y + h3*x[i].z + nb1;
x[i].z = h2*x[i].z + nb2;
@ -302,6 +306,7 @@ void FixNHIntel::remap()
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & dilate_group_bit) {
i = IP_PRE_dword_index(i);
x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
x[i].y = h1*x[i].y + h3*x[i].z + nb1;
x[i].z = h2*x[i].z + nb2;
@ -432,6 +437,7 @@ void FixNHIntel::nh_v_press()
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
i = IP_PRE_dword_index(i);
v[i].x *= f0;
v[i].y *= f1;
v[i].z *= f2;
@ -447,6 +453,7 @@ void FixNHIntel::nh_v_press()
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) {
i = IP_PRE_dword_index(i);
v[i].x *= f0;
v[i].y *= f1;
v[i].z *= f2;

View File

@ -168,11 +168,11 @@ void ImproperCvffIntel::eval(const int vflag,
#else
for (int n = nfrom; n < nto; n += npl) {
#endif
const int i1 = improperlist[n].a;
const int i2 = improperlist[n].b;
const int i3 = improperlist[n].c;
const int i4 = improperlist[n].d;
const int type = improperlist[n].t;
const int i1 = IP_PRE_dword_index(improperlist[n].a);
const int i2 = IP_PRE_dword_index(improperlist[n].b);
const int i3 = IP_PRE_dword_index(improperlist[n].c);
const int i4 = IP_PRE_dword_index(improperlist[n].d);
const int type = IP_PRE_dword_index(improperlist[n].t);
// geometry of 4-body

View File

@ -170,11 +170,11 @@ void ImproperHarmonicIntel::eval(const int vflag,
#else
for (int n = nfrom; n < nto; n += npl) {
#endif
const int i1 = improperlist[n].a;
const int i2 = improperlist[n].b;
const int i3 = improperlist[n].c;
const int i4 = improperlist[n].d;
const int type = improperlist[n].t;
const int i1 = IP_PRE_dword_index(improperlist[n].a);
const int i2 = IP_PRE_dword_index(improperlist[n].b);
const int i3 = IP_PRE_dword_index(improperlist[n].c);
const int i4 = IP_PRE_dword_index(improperlist[n].d);
const int type = IP_PRE_dword_index(improperlist[n].t);
// geometry of 4-body

View File

@ -16,10 +16,16 @@
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include "lmptype.h"
// Intel LLVM-based compiler support: alias the classic Intel compiler
// macros to the LLVM compiler's version macro so that code guarded by
// __INTEL_COMPILER is also enabled for this compiler.
// NOTE(review): USE_OMP_SIMD is only defined here; its consumers are not
// visible in this part of the file.
#ifdef __INTEL_LLVM_COMPILER
#define USE_OMP_SIMD
#define __INTEL_COMPILER __INTEL_LLVM_COMPILER
#define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
// Indicate to vectorizer that it is safe to use dword indexed gather
// by masking the index with NEIGHMASK. NEIGHMASK must be defined wherever
// this macro is expanded (presumably via lmptype.h -- TODO confirm).
#define IP_PRE_dword_index(i) ((i) & NEIGHMASK)
#else
// Other compilers: the index is passed through unchanged. The argument is
// parenthesized so that an expression argument (e.g. a + b) keeps its
// grouping when the expansion is combined with neighbouring operators.
#define IP_PRE_dword_index(i) (i)
#endif
#ifdef __INTEL_COMPILER

View File

@ -370,7 +370,7 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
#pragma vector aligned
#endif
for (int u = 0; u < ncount; u++) {
const int j = tj[u];
const int j = IP_PRE_dword_index(tj[u]);
tx[u] = x[j].x;
ty[u] = x[j].y;
tz[u] = x[j].z;

View File

@ -359,7 +359,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
#pragma vector aligned
#endif
for (int u = 0; u < ncount; u++) {
const int j = tj[u];
const int j = IP_PRE_dword_index(tj[u]);
tx[u] = x[j].x;
ty[u] = x[j].y;
tz[u] = x[j].z;
@ -387,7 +387,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
#endif
#endif
for (int jj = bstart; jj < bend; jj++) {
const int j = binpacked[jj];
const int j = IP_PRE_dword_index(binpacked[jj]);
itj[icount] = j;
itx[icount] = x[j].x;
ity[icount] = x[j].y;

View File

@ -265,7 +265,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
const int jtype = x[j].w;
const int jtype = IP_PRE_dword_index(x[j].w);
const flt_t rsq = delx * delx + dely * dely + delz * delz;
const flt_t r = sqrt(rsq);
const flt_t r2inv = (flt_t)1.0 / rsq;

View File

@ -289,7 +289,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
const int jtype = x[j].w;
const int jtype = IP_PRE_dword_index(x[j].w);
const flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq < c_forcei[jtype].cutsq) {

View File

@ -253,7 +253,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
const int jtype = x[j].w;
const int jtype = IP_PRE_dword_index(x[j].w);
const flt_t rsq = delx * delx + dely * dely + delz * delz;
const flt_t r = sqrt(rsq);
const flt_t r2inv = (flt_t)1.0 / rsq;

View File

@ -312,13 +312,13 @@ void PairDPDIntel::eval(const int offload, const int vflag,
sbindex = jlist[jj] >> SBBITS & 3;
j = jlist[jj] & NEIGHMASK;
} else
j = jlist[jj];
j = IP_PRE_dword_index(jlist[jj]);
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
if (!ONETYPE) {
jtype = x[j].w;
jtype = IP_PRE_dword_index(x[j].w);
icut = parami[jtype].icut;
}
const flt_t rsq = delx * delx + dely * dely + delz * delz;

View File

@ -347,14 +347,15 @@ void PairEAMIntel::eval(const int offload, const int vflag,
p = MIN(p,(flt_t)1.0);
if (!ONETYPE)
rhor_joff = rhor_ioff + jtype * jstride;
const int joff = rhor_joff + m;
const int joff = IP_PRE_dword_index(rhor_joff + m);
flt_t ra;
ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
rhoi += ra;
if (NEWTON_PAIR) {
if (!ONETYPE) {
const int ioff = jtype * istride + itype * jstride + m;
const int ioff = IP_PRE_dword_index(jtype * istride + itype *
jstride + m);
ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
}
@ -439,7 +440,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
#pragma vector aligned
#endif
for (int ii = iifrom; ii < iito; ++ii) {
const int i = ilist[ii];
const int i = IP_PRE_dword_index(ilist[ii]);
int itype;
if (!ONETYPE) itype = x[i].w;
flt_t p = rho[i]*frdrho + (flt_t)1.0;
@ -448,7 +449,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
p -= m;
p = MIN(p,(flt_t)1.0);
if (!ONETYPE) frho_ioff = itype * fstride;
const int ioff = frho_ioff + m;
const int ioff = IP_PRE_dword_index(frho_ioff + m);
fp_f[i] = (frho_spline_f[ioff].a*p + frho_spline_f[ioff].b)*p +
frho_spline_f[ioff].c;
if (EFLAG) {
@ -553,13 +554,14 @@ void PairEAMIntel::eval(const int offload, const int vflag,
p = MIN(p,(flt_t)1.0);
if (!ONETYPE)
rhor_joff = rhor_ioff + jtype * jstride;
const int joff = rhor_joff + m;
const int joff = IP_PRE_dword_index(rhor_joff + m);
const flt_t rhojp = (rhor_spline_f[joff].a*p +
rhor_spline_f[joff].b)*p +
rhor_spline_f[joff].c;
flt_t rhoip;
if (!ONETYPE) {
const int ioff = jtype * istride + itype * jstride + m;
const int ioff = IP_PRE_dword_index(jtype * istride +
itype * jstride + m);
rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p +
rhor_spline_f[ioff].c;
} else

View File

@ -417,7 +417,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
for (int jj = 0; jj < jnum; jj++) {
int jm = jlist[jj];
int j = jm & NEIGHMASK;
const int jtype = x[j].w;
const int jtype = IP_PRE_dword_index(x[j].w);
if (ijci[jtype].form == ELLIPSE_ELLIPSE) {
flt_t delx = x[j].x-xtmp;
@ -473,7 +473,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
const int sbindex = jlist_form[jj] >> SBBITS & 3;
const int j = jlist_form[jj] & NEIGHMASK;
flt_t factor_lj = special_lj[sbindex];
const int jtype = jtype_form[jj];
const int jtype = IP_PRE_dword_index(jtype_form[jj]);
const flt_t sigma = ijci[jtype].sigma;
const flt_t epsilon = ijci[jtype].epsilon;
const flt_t shape2_0 = ic[jtype].shape2[0];

View File

@ -318,7 +318,7 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
#ifdef INTEL_VMASK
if (rsq < cut_ljsq) {
#endif
const int jtype = tjtype[jj];
const int jtype = IP_PRE_dword_index(tjtype[jj]);
flt_t r6inv = r2inv * r2inv * r2inv;
forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);

View File

@ -324,7 +324,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
const int j = tj[jj] & NEIGHMASK;
const int sbindex = tj[jj] >> SBBITS & 3;
const int jtype = tjtype[jj];
const int jtype = IP_PRE_dword_index(tjtype[jj]);
const flt_t rsq = trsq[jj];
const flt_t r2inv = (flt_t)1.0 / rsq;

View File

@ -287,7 +287,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
const int jtype = x[j].w;
const int jtype = IP_PRE_dword_index(x[j].w);
const flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq < c_forcei[jtype].cutsq) {
@ -316,8 +316,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
const int j = tj[jj] & NEIGHMASK;
const int sbindex = tj[jj] >> SBBITS & 3;
const int jtype = tjtype[jj];
const int sbindex = IP_PRE_dword_index(tj[jj] >> SBBITS & 3);
const int jtype = IP_PRE_dword_index(tjtype[jj]);
const flt_t rsq = trsq[jj];
const flt_t r2inv = (flt_t)1.0 / rsq;

View File

@ -262,13 +262,13 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
sbindex = jlist[jj] >> SBBITS & 3;
j = jlist[jj] & NEIGHMASK;
} else
j = jlist[jj];
j = IP_PRE_dword_index(jlist[jj]);
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
if (!ONETYPE) {
jtype = x[j].w;
jtype = IP_PRE_dword_index(x[j].w);
cutsq = ljc12oi[jtype].cutsq;
}
const flt_t rsq = delx * delx + dely * dely + delz * delz;

View File

@ -332,7 +332,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
int jtype, ijtype;
if (!ONETYPE) {
jtype = x[j].w;
ijtype = itype_offset + jtype;
ijtype = IP_PRE_dword_index(itype_offset + jtype);
cutsq = p2[ijtype].cutsq;
}
const flt_t rsq1 = delx * delx + dely * dely + delz * delz;
@ -378,7 +378,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
if (EFLAG) fjtmp = (acc_t)0.0;
int ijtype;
if (!ONETYPE) ijtype = tjtype[jj] + itype_offset;
if (!ONETYPE) ijtype = IP_PRE_dword_index(tjtype[jj] + itype_offset);
const flt_t rsq1 = trsq[jj];
const flt_t rinvsq1 = (flt_t)1.0 / rsq1;
@ -459,8 +459,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
int iktype, ijktype;
if (!ONETYPE) {
iktype = tjtype[kk];
ijktype = ijkoff + iktype;
iktype += itype_offset;
ijktype = IP_PRE_dword_index(ijkoff + iktype);
iktype = IP_PRE_dword_index(iktype + itype_offset);
cut = p2[iktype].cut;
sigma_gamma = p2[iktype].sigma_gamma;
costheta = p3[ijktype].costheta;
@ -520,7 +520,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
}
}
} // for kk
const int j = tj[jj];
const int j = IP_PRE_dword_index(tj[jj]);
f[j].x += fjxtmp;
f[j].y += fjytmp;
f[j].z += fjztmp;

View File

@ -403,7 +403,6 @@ void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers)
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
// current particle coord can be outside global and local box
// add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
int nx = static_cast<int> ((x[i].x-lo0)*xi+fshift) - OFFSET;
int ny = static_cast<int> ((x[i].y-lo1)*yi+fshift) - OFFSET;
int nz = static_cast<int> ((x[i].z-lo2)*zi+fshift) - OFFSET;
@ -941,6 +940,7 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
#endif
#endif
for (int i = ifrom; i < ito; i++) {
i = IP_PRE_dword_index(i);
particle_ekx[i] *= hx_inv;
particle_eky[i] *= hy_inv;
particle_ekz[i] *= hz_inv;