Converting Cilk vectorization directives to the OpenMP standard and changing some more deprecated vector intrinsics. Data alignment directives for compiler vectorization are still mostly Intel-specific.

Mike Brown
2021-07-26 11:22:21 -07:00
parent 1345c25f41
commit fefcd0e2af
33 changed files with 1013 additions and 189 deletions
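The same two mechanical transformations repeat across the 33 files. First, each legacy Cilk-style "#pragma simd" or "#pragma simdoff" directive gains a preprocessor dispatch: compilers that define USE_OMP_SIMD (set in the preprocessor header below for __INTEL_LLVM_COMPILER, the LLVM-based oneAPI compiler) get the standard OpenMP directive, while the classic Intel compiler keeps the old spelling. A minimal sketch of the pattern, assuming placeholder names that are not code from this commit:

// Hedged sketch of the directive dispatch used throughout this commit.
// The function, array names, and accumulator are illustrative placeholders.
double reduce_sketch(const double *f, const double *x, int nfrom, int nto) {
  double esum = 0.0;
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:esum)   // OpenMP 4.0+ standard directive
#else
#pragma simd reduction(+:esum)       // legacy Cilk-style directive (classic icc)
#endif
  for (int n = nfrom; n < nto; n++)
    esum += f[n] * x[n];
  return esum;
}

"#pragma simdoff", which keeps a scalar force-update block out of vectorization, maps the same way onto the standard "#pragma omp ordered simd". The second transformation, replacing the KNC-era gather/scatter intrinsics, is sketched after the first intrinsics hunk below.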

View File

@@ -162,7 +162,11 @@ void AngleCharmmIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -246,7 +250,11 @@ void AngleCharmmIntel::eval(const int vflag,
// apply force to each of 3 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -162,7 +162,11 @@ void AngleHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -228,7 +232,11 @@ void AngleHarmonicIntel::eval(const int vflag,
// apply force to each of 3 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -158,7 +158,11 @@ void BondFENEIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -215,7 +219,11 @@ void BondFENEIntel::eval(const int vflag,
// apply force to each of 2 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -155,7 +155,11 @@ void BondHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -184,7 +188,11 @@ void BondHarmonicIntel::eval(const int vflag,
// apply force to each of 2 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -181,9 +181,16 @@ void DihedralCharmmIntel::eval(const int vflag,
}
#if defined(LMP_SIMD_COMPILER_TEST)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \
spv5)
#else
#pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \
spv5)
#endif
#pragma vector aligned
for (int n = nfrom; n < nto; n++) {
#endif
for (int n = nfrom; n < nto; n += npl) {
@@ -329,7 +336,11 @@ void DihedralCharmmIntel::eval(const int vflag,
#if defined(LMP_SIMD_COMPILER_TEST)
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i2 < nlocal) {
@@ -408,7 +419,11 @@ void DihedralCharmmIntel::eval(const int vflag,
// apply force to each of 4 atoms
#if defined(LMP_SIMD_COMPILER_TEST)
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -154,7 +154,11 @@ void DihedralFourierIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -304,7 +308,11 @@ void DihedralFourierIntel::eval(const int vflag,
}
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -154,7 +154,11 @@ void DihedralHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -299,7 +303,11 @@ void DihedralHarmonicIntel::eval(const int vflag,
}
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -158,7 +158,11 @@ void DihedralOPLSIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -319,7 +323,11 @@ void DihedralOPLSIntel::eval(const int vflag,
}
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -635,19 +635,31 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
if (_nthreads == 4) {
acc_t *f_scalar3 = f_scalar2 + f_stride4;
acc_t *f_scalar4 = f_scalar3 + f_stride4;
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
} else if (_nthreads == 2) {
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n];
} else {
acc_t *f_scalar3 = f_scalar2 + f_stride4;
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n] + f_scalar3[n];
}
@@ -662,8 +674,12 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
acc_t *f_scalar2 = f_scalar + f_stride4;
for (int t = 1; t < _nthreads; t++) {
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = iifrom; n < iito; n++)
f_scalar[n] += f_scalar2[n];
f_scalar2 += f_stride4;

View File

@@ -99,8 +99,12 @@ void FixNHIntel::remap()
if (allremap) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
const double d0 = x[i].x - b0;
@@ -112,8 +116,12 @@ void FixNHIntel::remap()
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & dilate_group_bit) {
@@ -278,8 +286,12 @@ void FixNHIntel::remap()
if (allremap) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
@@ -288,8 +300,12 @@ void FixNHIntel::remap()
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & dilate_group_bit) {
@@ -415,8 +431,12 @@ void FixNHIntel::nh_v_press()
if (igroup == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
v[i].x *= f0;
@@ -425,8 +445,12 @@ void FixNHIntel::nh_v_press()
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) {
@@ -448,8 +472,12 @@ void FixNHIntel::nve_v()
double * _noalias const v = atom->v[0];
const double * _noalias const f = atom->f[0];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
v[i] += _dtfm[i] * f[i];
@@ -468,15 +496,23 @@ void FixNHIntel::nve_x()
if (igroup == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
x[i] += dtv * v[i];
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
if (_dtfm[i] != 0.0)
@@ -500,15 +536,23 @@ void FixNHIntel::nh_v_temp()
if (igroup == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
v[i] *= factor_eta;
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
if (_dtfm[i] != 0.0)

View File

@@ -97,8 +97,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
dtq = 0.5 * dtv;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += _dtfm[i] * f[i];
@@ -108,8 +112,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
// update angular momentum by 1/2 step
if (igroup == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
double *quat = bonus[ellipsoid[i]].quat;
@@ -118,8 +126,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) {
@@ -143,8 +155,12 @@ void FixNVEAsphereIntel::final_integrate()
const double * _noalias const torque = atom->torque[0];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += _dtfm[i] * f[i];

View File

@@ -68,8 +68,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) {
const double dtfm = dtf / atom->mass[1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += dtfm * f[i];
@@ -78,8 +82,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
} else if (igroup == 0) {
if (neighbor->ago == 0) reset_dt();
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += _dtfm[i] * f[i];
@@ -88,8 +96,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
} else {
if (neighbor->ago == 0) reset_dt();
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
if (_dtfm[i] != 0.0) {
@@ -112,16 +124,24 @@ void FixNVEIntel::final_integrate()
_nlocal3 = 3 * atom->nlocal;
const double dtfm = dtf / atom->mass[1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
v[i] += dtfm * f[i];
} else if (igroup == 0) {
if (neighbor->ago == 0) reset_dt();
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++) {
v[i] += _dtfm[i] * f[i];
@@ -129,8 +149,12 @@ void FixNVEIntel::final_integrate()
} else {
if (neighbor->ago == 0) reset_dt();
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int i = 0; i < _nlocal3; i++)
v[i] += _dtfm[i] * f[i];

View File

@@ -165,7 +165,11 @@ void ImproperCvffIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -247,7 +251,11 @@ void ImproperCvffIntel::eval(const int vflag,
flt_t p, pd;
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (m == 2) {
@@ -319,7 +327,11 @@ void ImproperCvffIntel::eval(const int vflag,
// apply force to each of 4 atoms
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -167,7 +167,11 @@ void ImproperHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -276,7 +280,11 @@ void ImproperHarmonicIntel::eval(const int vflag,
// apply force to each of 4 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

View File

@@ -127,7 +127,8 @@ struct vector_ops<double, KNC> {
}
template<int scale>
static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
return _mm512_mask_i32logather_pd(from, mask, idx, base, scale);
return _mm512_mask_i32gather_pd(from, mask, _mm512_castsi512_si256(idx),
base, scale);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return _mm512_mask_blend_pd(mask, a, b);
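This gather change is the second recurring transformation: the KNC-only _mm512_mask_i32logather_pd / _mm512_mask_i32loscatter_pd intrinsics took a full 512-bit index vector and used only its low eight 32-bit indices, while the standard AVX-512 forms take a 256-bit index vector obtained with _mm512_castsi512_si256. A minimal sketch of the equivalence, assuming a placeholder wrapper name gather8:

#include <immintrin.h>

// Hedged sketch of the intrinsic substitution applied in this and the
// following files; gather8 and its arguments are placeholder names.
// The removed KNC form, for comparison (no longer provided by current
// compilers):  _mm512_mask_i32logather_pd(src, m, idx512, base, 8)
__m512d gather8(__m512d src, __mmask8 m, __m512i idx512, const double *base) {
  // Gather 8 doubles through the low 8 int32 indices of idx512, keeping
  // lanes from src where the mask bit is clear -- the same semantics.
  return _mm512_mask_i32gather_pd(src, m, _mm512_castsi512_si256(idx512),
                                  base, 8);
}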

View File

@@ -511,7 +511,8 @@ public:
const int scale) {
assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==8
return FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
return FVEC_SUFFIX(_mm512_i32gather_)(_mm512_castsi512_si256(idx.val_),
mem, sizeof(FVEC_SCAL_T));
# else
return FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
# endif
@@ -522,8 +523,8 @@ public:
) {
assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==8
return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
mem, sizeof(FVEC_SCAL_T));
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_,
_mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T));
# else
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
mem, sizeof(FVEC_SCAL_T));
@@ -609,8 +610,8 @@ public:
) {
assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==8
return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
mem, sizeof(FVEC_SCAL_T));
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_,
_mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T));
# else
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
mem, sizeof(FVEC_SCAL_T));
@@ -622,8 +623,9 @@ public:
) {
assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==8
FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, a.val_,
sizeof(FVEC_SCAL_T));
FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_,
_mm512_castsi512_si256(idx.val_),
a.val_, sizeof(FVEC_SCAL_T));
# else
FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, a.val_,
sizeof(FVEC_SCAL_T));
@@ -666,11 +668,11 @@ public:
const double * mem, const int scale
) {
assert(scale == sizeof(double));
__m512d lo = _mm512_mask_i32logather_pd(src.lo_, mask.val_, idx.val_, mem,
sizeof(double));
__m512d hi = _mm512_mask_i32logather_pd(src.hi_, get_bvec_hi(mask.val_),
get_ivec_hi(idx.val_), mem,
sizeof(double));
__m512d lo = _mm512_mask_i32gather_pd(src.lo_, mask.val_,
_mm512_castsi512_si256(idx.val_),
mem, sizeof(double));
__m512d hi = _mm512_mask_i32gather_pd(src.hi_, get_bvec_hi(mask.val_),
_mm512_castsi512_si256(get_ivec_hi(idx.val_)), mem, sizeof(double));
return avec16pd(lo, hi);
}
VEC_INLINE static void mask_i32loscatter(
@@ -678,10 +680,12 @@ public:
const avec16pd &a, const int scale
) {
assert(scale == sizeof(double));
_mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_,
_mm512_mask_i32scatter_pd(mem, mask.val_,
_mm512_castsi512_si256(idx.val_), a.lo_,
sizeof(double));
_mm512_mask_i32loscatter_pd(mem, get_bvec_hi(mask.val_),
get_ivec_hi(idx.val_), a.hi_, sizeof(double));
_mm512_mask_i32scatter_pd(mem, get_bvec_hi(mask.val_),
_mm512_castsi512_si256(get_ivec_hi(idx.val_)),
a.hi_, sizeof(double));
}
#define AVEC2_BINOP(the_sym, the_name) \

View File

@@ -17,8 +17,13 @@
------------------------------------------------------------------------- */
#ifdef __INTEL_LLVM_COMPILER
#define USE_OMP_SIMD
#define __INTEL_COMPILER __INTEL_LLVM_COMPILER
#define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
#define _MM_SCALE_1 1
#define _MM_SCALE_2 2
#define _MM_SCALE_4 4
#define _MM_SCALE_8 8
#endif
#ifdef __INTEL_COMPILER
@@ -332,6 +337,9 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
#endif
// TO BE DEPRECATED
#ifndef USE_OMP_SIMD
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
f_stride, pos, ov0, ov1, ov2, \
ov3, ov4, ov5) \
@@ -526,6 +534,198 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
} \
}
#else
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
f_stride, pos, ov0, ov1, ov2, \
ov3, ov4, ov5) \
{ \
acc_t *f_scalar = &f_start[0].x; \
flt_t *x_scalar = &pos[minlocal].x; \
int f_stride4 = f_stride * 4; \
_alignvar(acc_t ovv[16],64); \
int vwidth; \
if (sizeof(acc_t) == sizeof(double)) \
vwidth = INTEL_COMPILE_WIDTH/2; \
else \
vwidth = INTEL_COMPILE_WIDTH; \
if (vwidth < 4) vwidth = 4; \
_use_simd_pragma("omp simd aligned(ovv:64)") \
for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \
int remainder = lt % vwidth; \
if (lf > lt) remainder = 0; \
const int v_range = lt - remainder; \
if (nthreads == 2) { \
acc_t *f_scalar2 = f_scalar + f_stride4; \
for (int n = lf; n < v_range; n += vwidth) { \
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,ovv,x_scalar:64)")\
for (int v = 0; v < vwidth; v++) { \
f_scalar[n+v] += f_scalar2[n+v]; \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
} \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
if (vwidth > 4) { \
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
} \
if (vwidth > 8) { \
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
} \
} \
_use_simd_pragma("vector aligned") \
_use_simd_pragma("ivdep") \
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
for (int n = v_range; n < lt; n++) \
f_scalar[n] += f_scalar2[n]; \
} else if (nthreads==4) { \
acc_t *f_scalar2 = f_scalar + f_stride4; \
acc_t *f_scalar3 = f_scalar2 + f_stride4; \
acc_t *f_scalar4 = f_scalar3 + f_stride4; \
for (int n = lf; n < v_range; n += vwidth) { \
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4,ovv:64)") \
for (int v = 0; v < vwidth; v++) { \
f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \
f_scalar4[n+v]; \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
} \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
if (vwidth > 4) { \
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
} \
if (vwidth > 8) { \
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
} \
} \
_use_simd_pragma("vector aligned") \
_use_simd_pragma("ivdep") \
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
for (int n = v_range; n < lt; n++) \
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \
} else if (nthreads==1) { \
for (int n = lf; n < v_range; n += vwidth) { \
_use_simd_pragma("omp simd aligned(ovv,f_scalar,x_scalar:64)") \
for (int v = 0; v < vwidth; v++) \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
if (vwidth > 4) { \
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
} \
if (vwidth > 8) { \
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
} \
} \
} else if (nthreads==3) { \
acc_t *f_scalar2 = f_scalar + f_stride4; \
acc_t *f_scalar3 = f_scalar2 + f_stride4; \
for (int n = lf; n < v_range; n += vwidth) { \
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,ovv,x_scalar:64)") \
for (int v = 0; v < vwidth; v++) { \
f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
} \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
if (vwidth > 4) { \
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
} \
if (vwidth > 8) { \
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
} \
} \
_use_simd_pragma("vector aligned") \
_use_simd_pragma("ivdep") \
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
for (int n = v_range; n < lt; n++) \
f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \
} \
for (int n = v_range; n < lt; n += 4) { \
_use_simd_pragma("vector aligned") \
_use_simd_pragma("ivdep") \
for (int v = 0; v < 4; v++) \
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
} \
ov0 += ovv[0]; \
ov1 += ovv[1]; \
ov2 += ovv[2]; \
if (vwidth > 4) { \
ov0 += ovv[4]; \
ov1 += ovv[5]; \
ov2 += ovv[6]; \
} \
if (vwidth > 8) { \
ov0 += ovv[8] + ovv[12]; \
ov1 += ovv[9] + ovv[13]; \
ov2 += ovv[10] + ovv[14]; \
} \
}
#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \
f_stride, pos, offload, vflag, ov0, ov1, \
ov2, ov3, ov4, ov5) \
{ \
int o_range = (nall - minlocal) * 4; \
IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \
sizeof(acc_t)); \
\
acc_t *f_scalar = &f_start[0].x; \
int f_stride4 = f_stride * 4; \
int t; \
if (vflag == VIRIAL_FDOTR) t = 4; else t = 1; \
acc_t *f_scalar2 = f_scalar + f_stride4 * t; \
for ( ; t < nthreads; t++) { \
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2:64)") \
for (int n = iifrom; n < iito; n++) \
f_scalar[n] += f_scalar2[n]; \
f_scalar2 += f_stride4; \
} \
\
if (vflag == VIRIAL_FDOTR) { \
int nt_min = MIN(4,nthreads); \
IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \
f_stride, pos, ov0, ov1, ov2, ov3, ov4, \
ov5); \
} \
}
#endif
#ifdef _LMP_INTEL_OFFLOAD
#include <sys/time.h>

View File

@@ -173,7 +173,7 @@ namespace ip_simd {
}
inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) {
return _mm512_i32logather_pd(i, p, _MM_SCALE_8);
return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8);
}
inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p,
@@ -190,8 +190,8 @@ namespace ip_simd {
inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p,
const SIMD_int &i) {
return _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, p,
_MM_SCALE_8);
return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), p, _MM_SCALE_8);
}
template <typename T>
@@ -227,8 +227,8 @@ namespace ip_simd {
inline SIMD_double SIMD_gatherz(const SIMD_mask &m, const double *p,
const SIMD_int &i) {
return _mm512_mask_i32logather_pd( _mm512_set1_pd(0.0), m, i, p,
_MM_SCALE_8);
return _mm512_mask_i32gather_pd( _mm512_set1_pd(0.0), m,
_mm512_castsi512_si256(i),p, _MM_SCALE_8);
}
// ------- Store Operations
@@ -257,7 +257,8 @@ namespace ip_simd {
inline void SIMD_scatter(const SIMD_mask &m, double *p,
const SIMD_int &i, const SIMD_double &vec) {
_mm512_mask_i32loscatter_pd(p, m, i, vec, _MM_SCALE_8);
_mm512_mask_i32scatter_pd(p, m, _mm512_castsi512_si256(i), vec,
_MM_SCALE_8);
}
// ------- Arithmetic Operations
@@ -834,22 +835,28 @@ namespace ip_simd {
inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
const SIMD_int &i, SIMD_double &x,
SIMD_double &y, SIMD_double &z) {
x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom,
x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom,
_MM_SCALE_2);
y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1,
y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom+1,
_MM_SCALE_2);
z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2,
z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom+2,
_MM_SCALE_2);
}
inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
const SIMD_int &i, SIMD_double &x,
SIMD_double &y, SIMD_double &z, SIMD_int &type) {
x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom,
x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom,
_MM_SCALE_2);
y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1,
y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom+1,
_MM_SCALE_2);
z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2,
z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), atom+2,
_MM_SCALE_2);
type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3,
_MM_SCALE_2);
@@ -888,10 +895,12 @@ namespace ip_simd {
const SIMD_int &joffset, SIMD_double &eng) {
SIMD_double jeng;
SIMD_conflict_pi_reduce1(rmask, joffset, eng);
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force, _MM_SCALE_2);
jeng = jeng + eng;
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
jeng, _MM_SCALE_2);
}
inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
@@ -899,20 +908,24 @@ namespace ip_simd {
SIMD_double engd, jeng;
engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng));
SIMD_conflict_pi_reduce1(rmask, joffset, engd);
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force, _MM_SCALE_2);
jeng = jeng + engd;
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
jeng, _MM_SCALE_2);
SIMD_mask rmask2 = rmask >> 8;
engd = _mm512_cvtps_pd(_mm512_castps512_ps256(
_mm512_shuffle_f32x4(eng,eng,238)));
SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
SIMD_conflict_pi_reduce1(rmask2, joffset2, engd);
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
_mm512_castsi512_si256(joffset2),
force, _MM_SCALE_2);
jeng = jeng + engd;
_mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
jeng, _MM_SCALE_2);
}
inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force,
@@ -926,10 +939,12 @@ namespace ip_simd {
SIMD_double jeng;
SIMD_conflict_pi_reduce1(rmask, joffset, eng);
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force, _MM_SCALE_2);
jeng = jeng + eng;
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
jeng, _MM_SCALE_2);
}
inline void SIMD_safe_jforce(const SIMD_mask &m, float *force,
@@ -956,18 +971,24 @@ namespace ip_simd {
SIMD_double &fy, SIMD_double &fz) {
SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
SIMD_double jfrc;
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force,
_MM_SCALE_2);
jfrc = jfrc + fx;
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 1,
_MM_SCALE_2);
jfrc = jfrc + fy;
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 2,
_MM_SCALE_2);
jfrc = jfrc + fz;
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
}
inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force,
@@ -979,18 +1000,24 @@ namespace ip_simd {
amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
SIMD_double jfrc;
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force, _MM_SCALE_2);
jfrc = jfrc + amxd;
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force + 1, _MM_SCALE_2);
jfrc = jfrc + amyd;
_mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
_mm512_mask_i32scatter_pd(force+1, rmask, _mm512_castsi512_si256(joffset),
jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
_mm512_castsi512_si256(joffset),
force + 2, _MM_SCALE_2);
jfrc = jfrc + amzd;
_mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, rmask, _mm512_castsi512_si256(joffset),
jfrc, _MM_SCALE_2);
SIMD_mask rmask2 = rmask >> 8;
amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
@@ -1001,18 +1028,26 @@ namespace ip_simd {
_mm512_shuffle_f32x4(amz,amz,238)));
SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
_mm512_castsi512_si256(joffset2),
force, _MM_SCALE_2);
jfrc = jfrc + amxd;
_mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
_mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
_mm512_castsi512_si256(joffset2),
force + 1, _MM_SCALE_2);
jfrc = jfrc + amyd;
_mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
_mm512_mask_i32scatter_pd(force+1, rmask2,
_mm512_castsi512_si256(joffset2), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
_mm512_castsi512_si256(joffset2),
force + 2, _MM_SCALE_2);
jfrc = jfrc + amzd;
_mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, rmask2,
_mm512_castsi512_si256(joffset2), jfrc,
_MM_SCALE_2);
}
inline void SIMD_jforce_update(const SIMD_mask &m, float *force,
@@ -1064,18 +1099,24 @@ namespace ip_simd {
const SIMD_int &i, const SIMD_double &fx,
const SIMD_double &fy, const SIMD_double &fz) {
SIMD_double jfrc;
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force,
_MM_SCALE_2);
jfrc = jfrc - fx;
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 1,
_MM_SCALE_2);
jfrc = jfrc - fy;
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 2,
_MM_SCALE_2);
jfrc = jfrc - fz;
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
}
inline void SIMD_jforce_update(const SIMD_mask &rmask,
@@ -1502,11 +1543,12 @@ namespace ip_simd {
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
hmask, k, force + 3,
_MM_SCALE_2);
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
_mm512_castsi512_si256(k),
force + 3, _MM_SCALE_2);
keng = keng + hevdwl;
_mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
keng, _MM_SCALE_2);
}
}
@@ -1523,11 +1565,12 @@ namespace ip_simd {
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
hmask, k, force + 3,
_MM_SCALE_2);
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
_mm512_castsi512_si256(k),
force + 3, _MM_SCALE_2);
keng = keng + hevdwl;
_mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
keng, _MM_SCALE_2);
}
SIMD_mask hmask2 = hmask >> 8;
facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(
@@ -1539,11 +1582,13 @@ namespace ip_simd {
fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238);
SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl);
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
hmask2, k2, force + 3,
_MM_SCALE_2);
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(),
hmask2,
_mm512_castsi512_si256(k2),
force + 3, _MM_SCALE_2);
keng = keng + hevdwl;
_mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force + 3, hmask2, _mm512_castsi512_si256(k2),
keng, _MM_SCALE_2);
}
}
@@ -1815,24 +1860,32 @@ namespace ip_simd {
const int EFLAG, const int eatom,
const SIMD_double &fwtmp) {
SIMD_double jfrc;
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force,
_MM_SCALE_2);
jfrc = jfrc + fx;
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 1,
_MM_SCALE_2);
jfrc = jfrc + fy;
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i), force + 2,
_MM_SCALE_2);
jfrc = jfrc + fz;
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
if (EFLAG) {
if (eatom) {
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i,
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
_mm512_castsi512_si256(i),
force + 3, _MM_SCALE_2);
jfrc = jfrc + fwtmp;
_mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2);
_mm512_mask_i32scatter_pd(force+3, m, _mm512_castsi512_si256(i), jfrc,
_MM_SCALE_2);
}
}
}

View File

@@ -324,7 +324,11 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++)
tj[ncount++] = binpacked[jj];
@@ -345,15 +349,23 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + stencil[k]];
const int bend = binhead[ibin + stencil[k] + 1];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++)
tj[ncount++] = binpacked[jj];
}
} // if i < nlocal
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int u = 0; u < ncount; u++) {
const int j = tj[u];
@@ -425,12 +437,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
int alln = n;
n = 0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#ifdef LMP_INTEL_NBOR_COMPAT
#pragma ivdep
#else
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
#pragma vector aligned
#endif
for (int u = 0; u < alln; u++) {
int which;
@@ -454,12 +470,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
alln = n2;
n2 = maxnbors * 2;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#ifdef LMP_INTEL_NBOR_COMPAT
#pragma ivdep
#else
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
#pragma vector aligned
#endif
for (int u = n2; u < alln; u++) {
int which;

View File

@@ -344,14 +344,22 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++)
tj[ncount++] = binpacked[jj];
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int u = 0; u < ncount; u++) {
const int j = tj[u];
@@ -375,7 +383,11 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin];
const int bend = binhead[ibin + 1];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++) {
const int j = binpacked[jj];
@@ -533,12 +545,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
n = pack_offset;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#ifdef LMP_INTEL_NBOR_COMPAT
#pragma ivdep
#else
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
#pragma vector aligned
#endif
for (int u = n; u < alln; u++) {
int which;
@@ -566,12 +582,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
n2 = pack_offset + maxnbors;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#ifdef LMP_INTEL_NBOR_COMPAT
#pragma ivdep
#else
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
#pragma vector aligned
#endif
for (int u = n2; u < alln; u++) {
int which;
@@ -737,8 +757,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
int jnum = numneigh[i];
if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
#if __INTEL_COMPILER+0 > 1499
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(max:vlmax,vgmax) \
reduction(min:vlmin, vgmin)
#else
#pragma simd reduction(max:vlmax,vgmax) \
reduction(min:vlmin, vgmin)
#endif
#pragma vector aligned
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
#endif
for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj] & NEIGHMASK;
@@ -782,8 +808,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
int jnum = numneigh[i];
if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
int jj = 0;
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (jj = 0; jj < jnum; jj++) {
const int which = jlist[jj] >> SBBITS & 3;
const int j = jlist[jj] & NEIGHMASK;

View File

@@ -248,12 +248,18 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
if (NEWTON_PAIR == 0)
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
if (vflag == VIRIAL_PAIR)
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcecoul, forcebuck, evdwl, ecoul;

View File

@@ -309,9 +309,14 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcebuck, evdwl, ecoul;

View File

@@ -230,12 +230,18 @@ void PairBuckIntel::eval(const int offload, const int vflag,
fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
if (NEWTON_PAIR == 0)
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
if (vflag == VIRIAL_PAIR)
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < jnum; jj++) {

View File

@@ -289,9 +289,14 @@ void PairDPDIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;

View File

@@ -327,8 +327,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:rhoi)
#else
#pragma simd reduction(+:rhoi)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < ej; jj++) {
int jtype;
@@ -369,23 +373,35 @@ void PairEAMIntel::eval(const int offload, const int vflag,
const int rcount = nall;
if (nthreads == 2) {
double *trho2 = rho + nmax;
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (int n = 0; n < rcount; n++)
rho[n] += trho2[n];
} else if (nthreads == 4) {
double *trho2 = rho + nmax;
double *trho3 = trho2 + nmax;
double *trho4 = trho3 + nmax;
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (int n = 0; n < rcount; n++)
rho[n] += trho2[n] + trho3[n] + trho4[n];
} else {
double *trhon = rho + nmax;
for (int t = 1; t < nthreads; t++) {
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (int n = 0; n < rcount; n++)
rho[n] += trhon[n];
trhon += nmax;
@@ -414,8 +430,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
if (EFLAG) tevdwl = (acc_t)0.0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:tevdwl)
#else
#pragma simd reduction(+:tevdwl)
#endif
#pragma vector aligned
#endif
for (int ii = iifrom; ii < iito; ++ii) {
const int i = ilist[ii];
@@ -510,9 +530,14 @@ void PairEAMIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < ej; jj++) {
int jtype;

View File

@@ -449,9 +449,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
#endif
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \
t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#else
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \
t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#endif
for (int jj = 0; jj < packed_j; jj++) {
flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
@@ -806,8 +811,12 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
acc_t *f_scalar2 = f_scalar + fst4;
for (int t = 1; t < nthreads; t++) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int n = iifrom * 8; n < sto; n++)
f_scalar[n] += f_scalar2[n];

View File

@@ -294,9 +294,14 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl;

View File

@@ -314,9 +314,14 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl, ecoul;

View File

@@ -305,9 +305,14 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl, ecoul;

View File

@@ -241,9 +241,15 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5) \
aligned(jlist,x,ljc12oi,special_lj,f,lj34i:64)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#pragma vector aligned
#endif
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;
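This hunk additionally moves the alignment hint into the OpenMP branch through the aligned(...) clause; most other hunks keep the Intel-only "#pragma vector aligned" in both branches, which is the Intel-specific residue the commit message mentions. A sketch of the two spellings, assuming placeholder arrays that the caller really does align to 64 bytes:

// Hedged sketch: Intel-only vs. OpenMP-standard alignment assertions.
// a and b are placeholders and must actually be 64-byte aligned, or the
// generated aligned loads/stores can fault.
void scale_sketch(double *a, const double *b, int n) {
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(a,b:64)   // portable OpenMP alignment clause
#else
#pragma vector aligned             // Intel-only: assume all accesses aligned
#pragma simd
#endif
  for (int i = 0; i < n; i++)
    a[i] *= b[i];
}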

View File

@@ -371,8 +371,12 @@ void PairSWIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < ejnum_pad; jj++) {
acc_t fjxtmp, fjytmp, fjztmp, fjtmp;

View File

@@ -770,8 +770,12 @@ void PPPMDispIntel::particle_map(double delx, double dely, double delz,
IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:flag)
#else
#pragma simd reduction(+:flag)
#endif
#pragma vector aligned
#endif
for (int i = iifrom; i < iito; i++) {
@@ -876,7 +880,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@@ -885,7 +893,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3;
@@ -917,8 +929,12 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
@@ -939,7 +955,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@@ -1025,7 +1045,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@@ -1034,7 +1058,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3;
@@ -1067,8 +1095,12 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
@@ -1089,7 +1121,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr);
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@@ -1173,7 +1209,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@@ -1182,7 +1222,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3;
@@ -1215,8 +1259,12 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m + nysum;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@@ -1307,7 +1355,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@@ -1316,7 +1368,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3;
@ -1349,8 +1405,12 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
@ -1373,7 +1433,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr);
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@ -1454,7 +1518,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho_lookup[idx][k];
@ -1463,7 +1531,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1 = rho_coeff[order-1][k];
@ -1498,8 +1570,12 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -1624,7 +1700,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -1636,7 +1716,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -1680,8 +1764,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -1702,7 +1790,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx[i] *= hx_inv;
@ -1802,7 +1894,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho6_lookup[idx][k];
@ -1811,7 +1907,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
@ -1846,8 +1946,12 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -1967,7 +2071,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -1979,7 +2087,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -2023,8 +2135,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -2045,7 +2161,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx[i] *= hx_inv;
@ -2143,7 +2263,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho6_lookup[idx][k];
@ -2152,7 +2276,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
@ -2206,8 +2334,12 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -2398,7 +2530,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -2410,7 +2546,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -2479,8 +2619,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -2541,7 +2685,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx0[i] *= hx_inv;
@ -2671,7 +2819,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho6_lookup[idx][k];
@ -2680,7 +2832,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
@ -2721,8 +2877,12 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -2848,7 +3008,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -2860,7 +3024,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -2909,8 +3077,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -2992,7 +3164,11 @@ void PPPMDispIntel::precompute_rho()
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
@ -3006,7 +3182,11 @@ void PPPMDispIntel::precompute_rho()
}
if (differentiation_flag == 1) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
@ -3026,7 +3206,11 @@ void PPPMDispIntel::precompute_rho()
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower_6; k<=nupper_6;k++) {
FFT_SCALAR r1 = ZEROF;
@ -3040,7 +3224,11 @@ void PPPMDispIntel::precompute_rho()
}
if (differentiation_flag == 1) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower_6; k<=nupper_6;k++) {
FFT_SCALAR r1 = ZEROF;
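
Where the short grid-spreading loops above carry a trip-count hint, the hunks place it after whichever SIMD directive was selected, while keeping it inside the LMP_SIMD_COMPILER guard, since #pragma loop_count remains Intel-only. A hedged sketch of that shape; the function, buffer names, and the INTEL_P3M_ALIGNED_MAXORDER value are placeholders, not the real package definitions.

#define INTEL_P3M_ALIGNED_MAXORDER 16   // placeholder; the real value comes from the package headers

// Illustrative stand-in for the stencil-spreading loops above.
void spread_row(double *dest, const double *rho_row, double x0,
                int order, int mzy) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
  for (int l = 0; l < order; l++) {
    int mzyx = l + mzy;              // flattened grid index, as in the hunks above
    dest[mzyx] += x0 * rho_row[l];   // accumulate the weighted stencil term
  }
}

The hunks below apply the same conversion to the PPPMIntel versions of these routines.
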


@ -394,8 +394,12 @@ void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers)
IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:flag)
#else
#pragma simd reduction(+:flag)
#endif
#pragma vector aligned
#endif
for (int i = iifrom; i < iito; i++) {
@ -500,7 +504,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -509,7 +517,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3;
@ -541,7 +553,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mzyx = l + mzy;
@ -563,7 +579,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@ -645,7 +665,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho_lookup[idx][k];
@ -654,7 +678,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1 = rho_coeff[order-1][k];
@ -690,7 +718,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mx = l+nxsum;
@ -813,7 +845,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -825,7 +861,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -871,7 +911,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mx = l + nxsum;
@ -893,7 +937,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx[i] *= hx_inv;
@ -942,7 +990,11 @@ void PPPMIntel::precompute_rho()
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
@ -956,7 +1008,11 @@ void PPPMIntel::precompute_rho()
}
if (differentiation_flag == 1) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
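
In the PPPMIntel::particle_map hunk it is the reduction clause that differs between the two standards, while #pragma vector aligned stays unconditional inside the compiler guard. A sketch of that shape with an illustrative bounds test; the function and its arguments are placeholders, and 64-byte alignment of x is assumed here, as the Intel package buffers provide.

// Illustrative sketch of a particle_map-style out-of-range count.
int count_out_of_range(const double *x, int n, double lo, double hi) {
  int flag = 0;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:flag)
#else
#pragma simd reduction(+:flag)
#endif
#pragma vector aligned               // Intel-specific; asserts aligned access to x
#endif
  for (int i = 0; i < n; i++)
    if (x[i] < lo || x[i] >= hi) flag++;   // summed across SIMD lanes
  return flag;
}
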