Converting Cilk vectorization directives to the OpenMP standard and replacing some more deprecated vector intrinsics. Data alignment directives for compiler vectorization are still mostly Intel-specific.
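The same directive-selection pattern recurs in every kernel touched below. As a hedged, self-contained sketch (not code from the commit itself; the accumulator name mirrors the `sebond`/`sv0..sv5` reduction lists in the diff), the conversion keeps the deprecated Intel-only `#pragma simd` only as a fallback and uses the portable OpenMP 4.0 form when `USE_OMP_SIMD` is defined:

```cpp
// Minimal illustration of the directive conversion applied throughout this
// commit: compilers that define USE_OMP_SIMD get the standard OpenMP SIMD
// directive with a reduction clause; older Intel compilers fall back to the
// deprecated Cilk-style "#pragma simd" spelling.
#include <cstdio>

double reduce_energy(const double *e, int n) {
  double sebond = 0.0;
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sebond)
#else
#pragma simd reduction(+:sebond)   // deprecated Intel-specific form
#endif
  for (int i = 0; i < n; i++)
    sebond += e[i];
  return sebond;
}

int main() {
  double e[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  std::printf("%g\n", reduce_energy(e, 8));  // prints 36
}
```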
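The intrinsic replacements further down follow one rule: the deprecated `_mm512_*_i32logather_pd` / `_mm512_*_i32loscatter_pd` forms, which take a 512-bit index vector and implicitly use only its low eight 32-bit lanes, become the standard AVX-512F gathers/scatters that take a `__m256i` index obtained via `_mm512_castsi512_si256`. A minimal sketch of that migration (the helper names here are illustrative, not functions from the LAMMPS source):

```cpp
#include <immintrin.h>

// Sketch of the intrinsic migration: deprecated "logather/loscatter" forms
// (512-bit index, low half used) are replaced by plain AVX-512F forms that
// take a __m256i index.  Requires AVX-512F at compile time.
static inline __m512d gather_lo_pd(const double *base, __m512i idx) {
  // old: return _mm512_i32logather_pd(idx, base, 8);
  return _mm512_i32gather_pd(_mm512_castsi512_si256(idx), base, 8);
}

static inline __m512d mask_gather_lo_pd(__m512d src, __mmask8 m,
                                        const double *base, __m512i idx) {
  // old: return _mm512_mask_i32logather_pd(src, m, idx, base, 8);
  return _mm512_mask_i32gather_pd(src, m, _mm512_castsi512_si256(idx),
                                  base, 8);
}

static inline void mask_scatter_lo_pd(double *base, __mmask8 m,
                                      __m512i idx, __m512d v) {
  // old: _mm512_mask_i32loscatter_pd(base, m, idx, v, 8);
  _mm512_mask_i32scatter_pd(base, m, _mm512_castsi512_si256(idx), v, 8);
}
```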
@@ -162,7 +162,11 @@ void AngleCharmmIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -246,7 +250,11 @@ void AngleCharmmIntel::eval(const int vflag,
// apply force to each of 3 atoms

#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

@@ -162,7 +162,11 @@ void AngleHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -228,7 +232,11 @@ void AngleHarmonicIntel::eval(const int vflag,
// apply force to each of 3 atoms

#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

@@ -158,7 +158,11 @@ void BondFENEIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -215,7 +219,11 @@ void BondFENEIntel::eval(const int vflag,
// apply force to each of 2 atoms

#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

@@ -155,7 +155,11 @@ void BondHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -184,7 +188,11 @@ void BondHarmonicIntel::eval(const int vflag,

// apply force to each of 2 atoms
#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

@@ -181,9 +181,16 @@ void DihedralCharmmIntel::eval(const int vflag,
}

#if defined(LMP_SIMD_COMPILER_TEST)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \
spv5)
#else
#pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \
spv5)
#endif
#pragma vector aligned
for (int n = nfrom; n < nto; n++) {
#endif
for (int n = nfrom; n < nto; n += npl) {
@@ -329,7 +336,11 @@ void DihedralCharmmIntel::eval(const int vflag,


#if defined(LMP_SIMD_COMPILER_TEST)
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i2 < nlocal) {
@@ -408,7 +419,11 @@ void DihedralCharmmIntel::eval(const int vflag,

// apply force to each of 4 atoms
#if defined(LMP_SIMD_COMPILER_TEST)
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

@@ -154,7 +154,11 @@ void DihedralFourierIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -304,7 +308,11 @@ void DihedralFourierIntel::eval(const int vflag,
}

#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

@@ -154,7 +154,11 @@ void DihedralHarmonicIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -299,7 +303,11 @@ void DihedralHarmonicIntel::eval(const int vflag,
}

#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

@@ -158,7 +158,11 @@ void DihedralOPLSIntel::eval(const int vflag,
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
@@ -319,7 +323,11 @@ void DihedralOPLSIntel::eval(const int vflag,
}

#ifdef LMP_INTEL_USE_SIMDOFF
#if defined(USE_OMP_SIMD)
#pragma omp ordered simd
#else
#pragma simdoff
#endif
#endif
{
if (NEWTON_BOND || i1 < nlocal) {

@@ -635,19 +635,31 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
if (_nthreads == 4) {
acc_t *f_scalar3 = f_scalar2 + f_stride4;
acc_t *f_scalar4 = f_scalar3 + f_stride4;
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
} else if (_nthreads == 2) {
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n];
} else {
acc_t *f_scalar3 = f_scalar2 + f_stride4;
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n] + f_scalar3[n];
}
@@ -662,8 +674,12 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)

acc_t *f_scalar2 = f_scalar + f_stride4;
for (int t = 1; t < _nthreads; t++) {
_use_simd_pragma("vector aligned")
_use_simd_pragma("simd")
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f_scalar,f_scalar2:64)
#elif defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int n = iifrom; n < iito; n++)
f_scalar[n] += f_scalar2[n];
f_scalar2 += f_stride4;

@ -99,8 +99,12 @@ void FixNHIntel::remap()
|
||||
|
||||
if (allremap) {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
const double d0 = x[i].x - b0;
|
||||
@ -112,8 +116,12 @@ void FixNHIntel::remap()
|
||||
}
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
if (mask[i] & dilate_group_bit) {
|
||||
@ -278,8 +286,12 @@ void FixNHIntel::remap()
|
||||
|
||||
if (allremap) {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
|
||||
@ -288,8 +300,12 @@ void FixNHIntel::remap()
|
||||
}
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
if (mask[i] & dilate_group_bit) {
|
||||
@ -415,8 +431,12 @@ void FixNHIntel::nh_v_press()
|
||||
|
||||
if (igroup == 0) {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
v[i].x *= f0;
|
||||
@ -425,8 +445,12 @@ void FixNHIntel::nh_v_press()
|
||||
}
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
if (mask[i] & groupbit) {
|
||||
@ -448,8 +472,12 @@ void FixNHIntel::nve_v()
|
||||
double * _noalias const v = atom->v[0];
|
||||
const double * _noalias const f = atom->f[0];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++)
|
||||
v[i] += _dtfm[i] * f[i];
|
||||
@ -468,15 +496,23 @@ void FixNHIntel::nve_x()
|
||||
|
||||
if (igroup == 0) {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++)
|
||||
x[i] += dtv * v[i];
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++) {
|
||||
if (_dtfm[i] != 0.0)
|
||||
@ -500,15 +536,23 @@ void FixNHIntel::nh_v_temp()
|
||||
|
||||
if (igroup == 0) {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++)
|
||||
v[i] *= factor_eta;
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++) {
|
||||
if (_dtfm[i] != 0.0)
|
||||
|
||||
@ -97,8 +97,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
|
||||
dtq = 0.5 * dtv;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++) {
|
||||
v[i] += _dtfm[i] * f[i];
|
||||
@ -108,8 +112,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
|
||||
// update angular momentum by 1/2 step
|
||||
if (igroup == 0) {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
double *quat = bonus[ellipsoid[i]].quat;
|
||||
@ -118,8 +126,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
|
||||
}
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
if (mask[i] & groupbit) {
|
||||
@ -143,8 +155,12 @@ void FixNVEAsphereIntel::final_integrate()
|
||||
const double * _noalias const torque = atom->torque[0];
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++) {
|
||||
v[i] += _dtfm[i] * f[i];
|
||||
|
||||
@ -68,8 +68,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
|
||||
if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) {
|
||||
const double dtfm = dtf / atom->mass[1];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++) {
|
||||
v[i] += dtfm * f[i];
|
||||
@ -78,8 +82,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
|
||||
} else if (igroup == 0) {
|
||||
if (neighbor->ago == 0) reset_dt();
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++) {
|
||||
v[i] += _dtfm[i] * f[i];
|
||||
@ -88,8 +96,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
|
||||
} else {
|
||||
if (neighbor->ago == 0) reset_dt();
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++) {
|
||||
if (_dtfm[i] != 0.0) {
|
||||
@ -112,16 +124,24 @@ void FixNVEIntel::final_integrate()
|
||||
_nlocal3 = 3 * atom->nlocal;
|
||||
const double dtfm = dtf / atom->mass[1];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++)
|
||||
v[i] += dtfm * f[i];
|
||||
} else if (igroup == 0) {
|
||||
if (neighbor->ago == 0) reset_dt();
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++) {
|
||||
v[i] += _dtfm[i] * f[i];
|
||||
@ -129,8 +149,12 @@ void FixNVEIntel::final_integrate()
|
||||
} else {
|
||||
if (neighbor->ago == 0) reset_dt();
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int i = 0; i < _nlocal3; i++)
|
||||
v[i] += _dtfm[i] * f[i];
|
||||
|
||||
@ -165,7 +165,11 @@ void ImproperCvffIntel::eval(const int vflag,
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#else
|
||||
#pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#endif
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
#else
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
@ -247,7 +251,11 @@ void ImproperCvffIntel::eval(const int vflag,
|
||||
|
||||
flt_t p, pd;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp ordered simd
|
||||
#else
|
||||
#pragma simdoff
|
||||
#endif
|
||||
#endif
|
||||
{
|
||||
if (m == 2) {
|
||||
@ -319,7 +327,11 @@ void ImproperCvffIntel::eval(const int vflag,
|
||||
// apply force to each of 4 atoms
|
||||
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp ordered simd
|
||||
#else
|
||||
#pragma simdoff
|
||||
#endif
|
||||
#endif
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
|
||||
@ -167,7 +167,11 @@ void ImproperHarmonicIntel::eval(const int vflag,
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#else
|
||||
#pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#endif
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
#else
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
@ -276,7 +280,11 @@ void ImproperHarmonicIntel::eval(const int vflag,
|
||||
// apply force to each of 4 atoms
|
||||
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp ordered simd
|
||||
#else
|
||||
#pragma simdoff
|
||||
#endif
|
||||
#endif
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
|
||||
@ -127,7 +127,8 @@ struct vector_ops<double, KNC> {
|
||||
}
|
||||
template<int scale>
|
||||
static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
|
||||
return _mm512_mask_i32logather_pd(from, mask, idx, base, scale);
|
||||
return _mm512_mask_i32gather_pd(from, mask, _mm512_castsi512_si256(idx),
|
||||
base, scale);
|
||||
}
|
||||
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
|
||||
return _mm512_mask_blend_pd(mask, a, b);
|
||||
|
||||
@ -511,7 +511,8 @@ public:
|
||||
const int scale) {
|
||||
assert(scale == sizeof(FVEC_SCAL_T));
|
||||
# if FVEC_LEN==8
|
||||
return FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
|
||||
return FVEC_SUFFIX(_mm512_i32gather_)(_mm512_castsi512_si256(idx.val_),
|
||||
mem, sizeof(FVEC_SCAL_T));
|
||||
# else
|
||||
return FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
|
||||
# endif
|
||||
@ -522,8 +523,8 @@ public:
|
||||
) {
|
||||
assert(scale == sizeof(FVEC_SCAL_T));
|
||||
# if FVEC_LEN==8
|
||||
return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
|
||||
mem, sizeof(FVEC_SCAL_T));
|
||||
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_,
|
||||
_mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T));
|
||||
# else
|
||||
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
|
||||
mem, sizeof(FVEC_SCAL_T));
|
||||
@ -609,8 +610,8 @@ public:
|
||||
) {
|
||||
assert(scale == sizeof(FVEC_SCAL_T));
|
||||
# if FVEC_LEN==8
|
||||
return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
|
||||
mem, sizeof(FVEC_SCAL_T));
|
||||
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_,
|
||||
_mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T));
|
||||
# else
|
||||
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
|
||||
mem, sizeof(FVEC_SCAL_T));
|
||||
@ -622,8 +623,9 @@ public:
|
||||
) {
|
||||
assert(scale == sizeof(FVEC_SCAL_T));
|
||||
# if FVEC_LEN==8
|
||||
FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, a.val_,
|
||||
sizeof(FVEC_SCAL_T));
|
||||
FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_,
|
||||
_mm512_castsi512_si256(idx.val_),
|
||||
a.val_, sizeof(FVEC_SCAL_T));
|
||||
# else
|
||||
FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, a.val_,
|
||||
sizeof(FVEC_SCAL_T));
|
||||
@ -666,11 +668,11 @@ public:
|
||||
const double * mem, const int scale
|
||||
) {
|
||||
assert(scale == sizeof(double));
|
||||
__m512d lo = _mm512_mask_i32logather_pd(src.lo_, mask.val_, idx.val_, mem,
|
||||
sizeof(double));
|
||||
__m512d hi = _mm512_mask_i32logather_pd(src.hi_, get_bvec_hi(mask.val_),
|
||||
get_ivec_hi(idx.val_), mem,
|
||||
sizeof(double));
|
||||
__m512d lo = _mm512_mask_i32gather_pd(src.lo_, mask.val_,
|
||||
_mm512_castsi512_si256(idx.val_),
|
||||
mem, sizeof(double));
|
||||
__m512d hi = _mm512_mask_i32gather_pd(src.hi_, get_bvec_hi(mask.val_),
|
||||
_mm512_castsi512_si256(get_ivec_hi(idx.val_)), mem, sizeof(double));
|
||||
return avec16pd(lo, hi);
|
||||
}
|
||||
VEC_INLINE static void mask_i32loscatter(
|
||||
@ -678,10 +680,12 @@ public:
|
||||
const avec16pd &a, const int scale
|
||||
) {
|
||||
assert(scale == sizeof(double));
|
||||
_mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_,
|
||||
_mm512_mask_i32scatter_pd(mem, mask.val_,
|
||||
_mm512_castsi512_si256(idx.val_), a.lo_,
|
||||
sizeof(double));
|
||||
_mm512_mask_i32loscatter_pd(mem, get_bvec_hi(mask.val_),
|
||||
get_ivec_hi(idx.val_), a.hi_, sizeof(double));
|
||||
_mm512_mask_i32scatter_pd(mem, get_bvec_hi(mask.val_),
|
||||
_mm512_castsi512_si256(get_ivec_hi(idx.val_)),
|
||||
a.hi_, sizeof(double));
|
||||
}
|
||||
|
||||
#define AVEC2_BINOP(the_sym, the_name) \
|
||||
|
||||
@ -17,8 +17,13 @@
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef __INTEL_LLVM_COMPILER
|
||||
#define USE_OMP_SIMD
|
||||
#define __INTEL_COMPILER __INTEL_LLVM_COMPILER
|
||||
#define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
|
||||
#define _MM_SCALE_1 1
|
||||
#define _MM_SCALE_2 2
|
||||
#define _MM_SCALE_4 4
|
||||
#define _MM_SCALE_8 8
|
||||
#endif
|
||||
|
||||
#ifdef __INTEL_COMPILER
|
||||
@ -332,6 +337,9 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
|
||||
#endif
|
||||
|
||||
// TO BE DEPRECATED
|
||||
#ifndef USE_OMP_SIMD
|
||||
|
||||
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
|
||||
f_stride, pos, ov0, ov1, ov2, \
|
||||
ov3, ov4, ov5) \
|
||||
@ -526,6 +534,198 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
|
||||
f_stride, pos, ov0, ov1, ov2, \
|
||||
ov3, ov4, ov5) \
|
||||
{ \
|
||||
acc_t *f_scalar = &f_start[0].x; \
|
||||
flt_t *x_scalar = &pos[minlocal].x; \
|
||||
int f_stride4 = f_stride * 4; \
|
||||
_alignvar(acc_t ovv[16],64); \
|
||||
int vwidth; \
|
||||
if (sizeof(acc_t) == sizeof(double)) \
|
||||
vwidth = INTEL_COMPILE_WIDTH/2; \
|
||||
else \
|
||||
vwidth = INTEL_COMPILE_WIDTH; \
|
||||
if (vwidth < 4) vwidth = 4; \
|
||||
_use_simd_pragma("omp simd aligned(ovv:64)") \
|
||||
for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \
|
||||
int remainder = lt % vwidth; \
|
||||
if (lf > lt) remainder = 0; \
|
||||
const int v_range = lt - remainder; \
|
||||
if (nthreads == 2) { \
|
||||
acc_t *f_scalar2 = f_scalar + f_stride4; \
|
||||
for (int n = lf; n < v_range; n += vwidth) { \
|
||||
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,ovv,x_scalar:64)")\
|
||||
for (int v = 0; v < vwidth; v++) { \
|
||||
f_scalar[n+v] += f_scalar2[n+v]; \
|
||||
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||
} \
|
||||
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||
if (vwidth > 4) { \
|
||||
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
|
||||
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
|
||||
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
|
||||
} \
|
||||
if (vwidth > 8) { \
|
||||
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
|
||||
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
|
||||
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
|
||||
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
|
||||
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
|
||||
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
|
||||
} \
|
||||
} \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("ivdep") \
|
||||
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
|
||||
for (int n = v_range; n < lt; n++) \
|
||||
f_scalar[n] += f_scalar2[n]; \
|
||||
} else if (nthreads==4) { \
|
||||
acc_t *f_scalar2 = f_scalar + f_stride4; \
|
||||
acc_t *f_scalar3 = f_scalar2 + f_stride4; \
|
||||
acc_t *f_scalar4 = f_scalar3 + f_stride4; \
|
||||
for (int n = lf; n < v_range; n += vwidth) { \
|
||||
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4,ovv:64)") \
|
||||
for (int v = 0; v < vwidth; v++) { \
|
||||
f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \
|
||||
f_scalar4[n+v]; \
|
||||
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||
} \
|
||||
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||
if (vwidth > 4) { \
|
||||
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
|
||||
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
|
||||
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
|
||||
} \
|
||||
if (vwidth > 8) { \
|
||||
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
|
||||
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
|
||||
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
|
||||
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
|
||||
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
|
||||
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
|
||||
} \
|
||||
} \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("ivdep") \
|
||||
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
|
||||
for (int n = v_range; n < lt; n++) \
|
||||
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \
|
||||
} else if (nthreads==1) { \
|
||||
for (int n = lf; n < v_range; n += vwidth) { \
|
||||
_use_simd_pragma("omp simd aligned(ovv,f_scalar,x_scalar:64)") \
|
||||
for (int v = 0; v < vwidth; v++) \
|
||||
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||
if (vwidth > 4) { \
|
||||
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
|
||||
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
|
||||
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
|
||||
} \
|
||||
if (vwidth > 8) { \
|
||||
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
|
||||
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
|
||||
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
|
||||
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
|
||||
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
|
||||
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
|
||||
} \
|
||||
} \
|
||||
} else if (nthreads==3) { \
|
||||
acc_t *f_scalar2 = f_scalar + f_stride4; \
|
||||
acc_t *f_scalar3 = f_scalar2 + f_stride4; \
|
||||
for (int n = lf; n < v_range; n += vwidth) { \
|
||||
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,ovv,x_scalar:64)") \
|
||||
for (int v = 0; v < vwidth; v++) { \
|
||||
f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \
|
||||
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||
} \
|
||||
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||
if (vwidth > 4) { \
|
||||
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
|
||||
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
|
||||
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
|
||||
} \
|
||||
if (vwidth > 8) { \
|
||||
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
|
||||
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
|
||||
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
|
||||
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
|
||||
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
|
||||
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
|
||||
} \
|
||||
} \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("ivdep") \
|
||||
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
|
||||
for (int n = v_range; n < lt; n++) \
|
||||
f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \
|
||||
} \
|
||||
for (int n = v_range; n < lt; n += 4) { \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("ivdep") \
|
||||
for (int v = 0; v < 4; v++) \
|
||||
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||
} \
|
||||
ov0 += ovv[0]; \
|
||||
ov1 += ovv[1]; \
|
||||
ov2 += ovv[2]; \
|
||||
if (vwidth > 4) { \
|
||||
ov0 += ovv[4]; \
|
||||
ov1 += ovv[5]; \
|
||||
ov2 += ovv[6]; \
|
||||
} \
|
||||
if (vwidth > 8) { \
|
||||
ov0 += ovv[8] + ovv[12]; \
|
||||
ov1 += ovv[9] + ovv[13]; \
|
||||
ov2 += ovv[10] + ovv[14]; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \
|
||||
f_stride, pos, offload, vflag, ov0, ov1, \
|
||||
ov2, ov3, ov4, ov5) \
|
||||
{ \
|
||||
int o_range = (nall - minlocal) * 4; \
|
||||
IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \
|
||||
sizeof(acc_t)); \
|
||||
\
|
||||
acc_t *f_scalar = &f_start[0].x; \
|
||||
int f_stride4 = f_stride * 4; \
|
||||
int t; \
|
||||
if (vflag == VIRIAL_FDOTR) t = 4; else t = 1; \
|
||||
acc_t *f_scalar2 = f_scalar + f_stride4 * t; \
|
||||
for ( ; t < nthreads; t++) { \
|
||||
_use_simd_pragma("omp simd aligned(f_scalar,f_scalar2:64)") \
|
||||
for (int n = iifrom; n < iito; n++) \
|
||||
f_scalar[n] += f_scalar2[n]; \
|
||||
f_scalar2 += f_stride4; \
|
||||
} \
|
||||
\
|
||||
if (vflag == VIRIAL_FDOTR) { \
|
||||
int nt_min = MIN(4,nthreads); \
|
||||
IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \
|
||||
f_stride, pos, ov0, ov1, ov2, ov3, ov4, \
|
||||
ov5); \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
#include <sys/time.h>
|
||||
|
||||
|
||||
@ -173,7 +173,7 @@ namespace ip_simd {
|
||||
}
|
||||
|
||||
inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) {
|
||||
return _mm512_i32logather_pd(i, p, _MM_SCALE_8);
|
||||
return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8);
|
||||
}
|
||||
|
||||
inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p,
|
||||
@ -190,8 +190,8 @@ namespace ip_simd {
|
||||
|
||||
inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p,
|
||||
const SIMD_int &i) {
|
||||
return _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, p,
|
||||
_MM_SCALE_8);
|
||||
return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), p, _MM_SCALE_8);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -227,8 +227,8 @@ namespace ip_simd {
|
||||
|
||||
inline SIMD_double SIMD_gatherz(const SIMD_mask &m, const double *p,
|
||||
const SIMD_int &i) {
|
||||
return _mm512_mask_i32logather_pd( _mm512_set1_pd(0.0), m, i, p,
|
||||
_MM_SCALE_8);
|
||||
return _mm512_mask_i32gather_pd( _mm512_set1_pd(0.0), m,
|
||||
_mm512_castsi512_si256(i),p, _MM_SCALE_8);
|
||||
}
|
||||
|
||||
// ------- Store Operations
|
||||
@ -257,7 +257,8 @@ namespace ip_simd {
|
||||
|
||||
inline void SIMD_scatter(const SIMD_mask &m, double *p,
|
||||
const SIMD_int &i, const SIMD_double &vec) {
|
||||
_mm512_mask_i32loscatter_pd(p, m, i, vec, _MM_SCALE_8);
|
||||
_mm512_mask_i32scatter_pd(p, m, _mm512_castsi512_si256(i), vec,
|
||||
_MM_SCALE_8);
|
||||
}
|
||||
|
||||
// ------- Arithmetic Operations
|
||||
@ -834,22 +835,28 @@ namespace ip_simd {
|
||||
inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
|
||||
const SIMD_int &i, SIMD_double &x,
|
||||
SIMD_double &y, SIMD_double &z) {
|
||||
x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom,
|
||||
x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), atom,
|
||||
_MM_SCALE_2);
|
||||
y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1,
|
||||
y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), atom+1,
|
||||
_MM_SCALE_2);
|
||||
z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2,
|
||||
z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), atom+2,
|
||||
_MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
|
||||
const SIMD_int &i, SIMD_double &x,
|
||||
SIMD_double &y, SIMD_double &z, SIMD_int &type) {
|
||||
x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom,
|
||||
x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), atom,
|
||||
_MM_SCALE_2);
|
||||
y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1,
|
||||
y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), atom+1,
|
||||
_MM_SCALE_2);
|
||||
z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2,
|
||||
z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), atom+2,
|
||||
_MM_SCALE_2);
|
||||
type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3,
|
||||
_MM_SCALE_2);
|
||||
@ -888,10 +895,12 @@ namespace ip_simd {
|
||||
const SIMD_int &joffset, SIMD_double &eng) {
|
||||
SIMD_double jeng;
|
||||
SIMD_conflict_pi_reduce1(rmask, joffset, eng);
|
||||
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
|
||||
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
|
||||
_mm512_castsi512_si256(joffset),
|
||||
force, _MM_SCALE_2);
|
||||
jeng = jeng + eng;
|
||||
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
|
||||
jeng, _MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
|
||||
@ -899,20 +908,24 @@ namespace ip_simd {
|
||||
SIMD_double engd, jeng;
|
||||
engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng));
|
||||
SIMD_conflict_pi_reduce1(rmask, joffset, engd);
|
||||
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
|
||||
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
|
||||
_mm512_castsi512_si256(joffset),
|
||||
force, _MM_SCALE_2);
|
||||
jeng = jeng + engd;
|
||||
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
|
||||
jeng, _MM_SCALE_2);
|
||||
|
||||
SIMD_mask rmask2 = rmask >> 8;
|
||||
engd = _mm512_cvtps_pd(_mm512_castps512_ps256(
|
||||
_mm512_shuffle_f32x4(eng,eng,238)));
|
||||
SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
|
||||
SIMD_conflict_pi_reduce1(rmask2, joffset2, engd);
|
||||
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
|
||||
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
|
||||
_mm512_castsi512_si256(joffset2),
|
||||
force, _MM_SCALE_2);
|
||||
jeng = jeng + engd;
|
||||
_mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
|
||||
jeng, _MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force,
|
||||
@ -926,10 +939,12 @@ namespace ip_simd {
|
||||
|
||||
SIMD_double jeng;
|
||||
SIMD_conflict_pi_reduce1(rmask, joffset, eng);
|
||||
jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
|
||||
jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
|
||||
_mm512_castsi512_si256(joffset),
|
||||
force, _MM_SCALE_2);
|
||||
jeng = jeng + eng;
|
||||
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
|
||||
jeng, _MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline void SIMD_safe_jforce(const SIMD_mask &m, float *force,
|
||||
@ -956,18 +971,24 @@ namespace ip_simd {
|
||||
SIMD_double &fy, SIMD_double &fz) {
|
||||
SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
|
||||
SIMD_double jfrc;
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), force,
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc + fx;
|
||||
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
|
||||
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
|
||||
_MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), force + 1,
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc + fy;
|
||||
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
|
||||
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
|
||||
_MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), force + 2,
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc + fz;
|
||||
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
|
||||
_MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force,
|
||||
@ -979,18 +1000,24 @@ namespace ip_simd {
|
||||
amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
|
||||
SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
|
||||
SIMD_double jfrc;
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
|
||||
_mm512_castsi512_si256(joffset),
|
||||
force, _MM_SCALE_2);
|
||||
jfrc = jfrc + amxd;
|
||||
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
|
||||
_mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
|
||||
jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
|
||||
_mm512_castsi512_si256(joffset),
|
||||
force + 1, _MM_SCALE_2);
|
||||
jfrc = jfrc + amyd;
|
||||
_mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
|
||||
_mm512_mask_i32scatter_pd(force+1, rmask, _mm512_castsi512_si256(joffset),
|
||||
jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
|
||||
_mm512_castsi512_si256(joffset),
|
||||
force + 2, _MM_SCALE_2);
|
||||
jfrc = jfrc + amzd;
|
||||
_mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force+2, rmask, _mm512_castsi512_si256(joffset),
|
||||
jfrc, _MM_SCALE_2);
|
||||
|
||||
SIMD_mask rmask2 = rmask >> 8;
|
||||
amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
|
||||
@ -1001,18 +1028,26 @@ namespace ip_simd {
|
||||
_mm512_shuffle_f32x4(amz,amz,238)));
|
||||
SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
|
||||
SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
|
||||
_mm512_castsi512_si256(joffset2),
|
||||
force, _MM_SCALE_2);
|
||||
jfrc = jfrc + amxd;
|
||||
_mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
|
||||
_mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
|
||||
jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
|
||||
_mm512_castsi512_si256(joffset2),
|
||||
force + 1, _MM_SCALE_2);
|
||||
jfrc = jfrc + amyd;
|
||||
_mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
|
||||
_mm512_mask_i32scatter_pd(force+1, rmask2,
|
||||
_mm512_castsi512_si256(joffset2), jfrc,
|
||||
_MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
|
||||
_mm512_castsi512_si256(joffset2),
|
||||
force + 2, _MM_SCALE_2);
|
||||
jfrc = jfrc + amzd;
|
||||
_mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force+2, rmask2,
|
||||
_mm512_castsi512_si256(joffset2), jfrc,
|
||||
_MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline void SIMD_jforce_update(const SIMD_mask &m, float *force,
|
||||
@ -1064,18 +1099,24 @@ namespace ip_simd {
|
||||
const SIMD_int &i, const SIMD_double &fx,
|
||||
const SIMD_double &fy, const SIMD_double &fz) {
|
||||
SIMD_double jfrc;
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), force,
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc - fx;
|
||||
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
|
||||
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
|
||||
_MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), force + 1,
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc - fy;
|
||||
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
|
||||
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
|
||||
_MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), force + 2,
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc - fz;
|
||||
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
|
||||
_MM_SCALE_2);
|
||||
}
|
||||
|
||||
inline void SIMD_jforce_update(const SIMD_mask &rmask,
|
||||
@ -1502,11 +1543,12 @@ namespace ip_simd {
|
||||
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
|
||||
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
|
||||
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
|
||||
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
|
||||
hmask, k, force + 3,
|
||||
_MM_SCALE_2);
|
||||
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
|
||||
_mm512_castsi512_si256(k),
|
||||
force + 3, _MM_SCALE_2);
|
||||
keng = keng + hevdwl;
|
||||
_mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
|
||||
keng, _MM_SCALE_2);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1523,11 +1565,12 @@ namespace ip_simd {
|
||||
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
|
||||
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
|
||||
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
|
||||
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
|
||||
hmask, k, force + 3,
|
||||
_MM_SCALE_2);
|
||||
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
|
||||
_mm512_castsi512_si256(k),
|
||||
force + 3, _MM_SCALE_2);
|
||||
keng = keng + hevdwl;
|
||||
_mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
|
||||
keng, _MM_SCALE_2);
|
||||
}
|
||||
SIMD_mask hmask2 = hmask >> 8;
|
||||
facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(
|
||||
@ -1539,11 +1582,13 @@ namespace ip_simd {
|
||||
fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
|
||||
SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238);
|
||||
SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl);
|
||||
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
|
||||
hmask2, k2, force + 3,
|
||||
_MM_SCALE_2);
|
||||
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(),
|
||||
hmask2,
|
||||
_mm512_castsi512_si256(k2),
|
||||
force + 3, _MM_SCALE_2);
|
||||
keng = keng + hevdwl;
|
||||
_mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force + 3, hmask2, _mm512_castsi512_si256(k2),
|
||||
keng, _MM_SCALE_2);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1815,24 +1860,32 @@ namespace ip_simd {
|
||||
const int EFLAG, const int eatom,
|
||||
const SIMD_double &fwtmp) {
|
||||
SIMD_double jfrc;
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), force,
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc + fx;
|
||||
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
|
||||
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
|
||||
_MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), force + 1,
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc + fy;
|
||||
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
|
||||
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
|
||||
_MM_SCALE_2);
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i), force + 2,
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc + fz;
|
||||
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
|
||||
_MM_SCALE_2);
|
||||
if (EFLAG) {
|
||||
if (eatom) {
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i,
|
||||
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||
_mm512_castsi512_si256(i),
|
||||
force + 3, _MM_SCALE_2);
|
||||
jfrc = jfrc + fwtmp;
|
||||
_mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2);
|
||||
_mm512_mask_i32scatter_pd(force+3, m, _mm512_castsi512_si256(i), jfrc,
|
||||
_MM_SCALE_2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -324,7 +324,11 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
|
||||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++)
|
||||
tj[ncount++] = binpacked[jj];
|
||||
@ -345,15 +349,23 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
|
||||
const int bstart = binhead[ibin + stencil[k]];
|
||||
const int bend = binhead[ibin + stencil[k] + 1];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++)
|
||||
tj[ncount++] = binpacked[jj];
|
||||
}
|
||||
} // if i < nlocal
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int u = 0; u < ncount; u++) {
|
||||
const int j = tj[u];
|
||||
@ -425,12 +437,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
|
||||
int alln = n;
|
||||
n = 0;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#ifdef LMP_INTEL_NBOR_COMPAT
|
||||
#pragma ivdep
|
||||
#else
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int u = 0; u < alln; u++) {
|
||||
int which;
|
||||
@ -454,12 +470,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
|
||||
alln = n2;
|
||||
n2 = maxnbors * 2;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#ifdef LMP_INTEL_NBOR_COMPAT
|
||||
#pragma ivdep
|
||||
#else
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int u = n2; u < alln; u++) {
|
||||
int which;
|
||||
|
||||
@ -344,14 +344,22 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
||||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++)
|
||||
tj[ncount++] = binpacked[jj];
|
||||
}
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int u = 0; u < ncount; u++) {
|
||||
const int j = tj[u];
|
||||
@ -375,7 +383,11 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
||||
const int bstart = binhead[ibin];
|
||||
const int bend = binhead[ibin + 1];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++) {
|
||||
const int j = binpacked[jj];
|
||||
@ -533,12 +545,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
||||
|
||||
n = pack_offset;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#ifdef LMP_INTEL_NBOR_COMPAT
|
||||
#pragma ivdep
|
||||
#else
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int u = n; u < alln; u++) {
|
||||
int which;
|
||||
@ -566,12 +582,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
||||
n2 = pack_offset + maxnbors;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#ifdef LMP_INTEL_NBOR_COMPAT
|
||||
#pragma ivdep
|
||||
#else
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int u = n2; u < alln; u++) {
|
||||
int which;
|
||||
@ -737,8 +757,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
||||
int jnum = numneigh[i];
|
||||
if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
|
||||
#if __INTEL_COMPILER+0 > 1499
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd reduction(max:vlmax,vgmax) \
|
||||
reduction(min:vlmin, vgmin)
|
||||
#else
|
||||
#pragma simd reduction(max:vlmax,vgmax) \
|
||||
reduction(min:vlmin, vgmin)
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
const int j = jlist[jj] & NEIGHMASK;
|
||||
@ -782,8 +808,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
||||
int jnum = numneigh[i];
|
||||
if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
|
||||
int jj = 0;
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd
|
||||
#else
|
||||
#pragma simd
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
for (jj = 0; jj < jnum; jj++) {
|
||||
const int which = jlist[jj] >> SBBITS & 3;
|
||||
const int j = jlist[jj] & NEIGHMASK;
|
||||
|
||||
@ -248,12 +248,18 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||
if (NEWTON_PAIR == 0)
|
||||
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
if (vflag == VIRIAL_PAIR)
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#else
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
flt_t forcecoul, forcebuck, evdwl, ecoul;
|
||||
|
||||
@ -309,9 +309,14 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
||||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#else
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#endif
|
||||
for (int jj = 0; jj < ej; jj++) {
|
||||
flt_t forcecoul, forcebuck, evdwl, ecoul;
|
||||
|
||||
@ -230,12 +230,18 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||
if (NEWTON_PAIR == 0)
|
||||
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
if (vflag == VIRIAL_PAIR)
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#else
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
|
||||
|
||||
@ -289,9 +289,14 @@ void PairDPDIntel::eval(const int offload, const int vflag,
|
||||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#if defined(USE_OMP_SIMD)
|
||||
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#else
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#endif
|
||||
#pragma vector aligned
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
flt_t forcelj, evdwl;
|
||||
|
||||
@ -327,8 +327,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
}

#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:rhoi)
#else
#pragma simd reduction(+:rhoi)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < ej; jj++) {
int jtype;
@ -369,23 +373,35 @@ void PairEAMIntel::eval(const int offload, const int vflag,
const int rcount = nall;
if (nthreads == 2) {
double *trho2 = rho + nmax;
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (int n = 0; n < rcount; n++)
rho[n] += trho2[n];
} else if (nthreads == 4) {
double *trho2 = rho + nmax;
double *trho3 = trho2 + nmax;
double *trho4 = trho3 + nmax;
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (int n = 0; n < rcount; n++)
rho[n] += trho2[n] + trho3[n] + trho4[n];
} else {
double *trhon = rho + nmax;
for (int t = 1; t < nthreads; t++) {
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
for (int n = 0; n < rcount; n++)
rho[n] += trhon[n];
trhon += nmax;
@ -414,8 +430,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
if (EFLAG) tevdwl = (acc_t)0.0;

#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:tevdwl)
#else
#pragma simd reduction(+:tevdwl)
#endif
#pragma vector aligned
#endif
for (int ii = iifrom; ii < iito; ++ii) {
const int i = ilist[ii];
@ -510,9 +530,14 @@ void PairEAMIntel::eval(const int offload, const int vflag,
}

#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < ej; jj++) {
int jtype;

@ -449,9 +449,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
#endif
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \
t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#else
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \
t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#endif
for (int jj = 0; jj < packed_j; jj++) {
flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
@ -806,8 +811,12 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
acc_t *f_scalar2 = f_scalar + fst4;
for (int t = 1; t < nthreads; t++) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
for (int n = iifrom * 8; n < sto; n++)
f_scalar[n] += f_scalar2[n];

@ -294,9 +294,14 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
}

#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl;

@ -314,9 +314,14 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
}

#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl, ecoul;

@ -305,9 +305,14 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
}

#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
secoul, sv0, sv1, sv2, sv3, sv4, sv5)
#endif
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl, ecoul;

@ -241,9 +241,15 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;

#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5) \
aligned(jlist,x,ljc12oi,special_lj,f,lj34i:64)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#pragma vector aligned
#endif
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;

@ -371,8 +371,12 @@ void PairSWIntel::eval(const int offload, const int vflag,
}

#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
#else
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
#endif
#pragma vector aligned
#endif
for (int jj = 0; jj < ejnum_pad; jj++) {
acc_t fjxtmp, fjytmp, fjztmp, fjtmp;

@ -770,8 +770,12 @@ void PPPMDispIntel::particle_map(double delx, double dely, double delz,
IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));

#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:flag)
#else
#pragma simd reduction(+:flag)
#endif
#pragma vector aligned
#endif
for (int i = iifrom; i < iito; i++) {

@ -876,7 +880,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -885,7 +893,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3;
@ -917,8 +929,12 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
@ -939,7 +955,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);

#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@ -1025,7 +1045,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -1034,7 +1058,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3;
@ -1067,8 +1095,12 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
@ -1089,7 +1121,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr);

#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@ -1173,7 +1209,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -1182,7 +1222,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3;
@ -1215,8 +1259,12 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m + nysum;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -1307,7 +1355,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -1316,7 +1368,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3;
@ -1349,8 +1405,12 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
@ -1373,7 +1433,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr);

#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@ -1454,7 +1518,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho_lookup[idx][k];
@ -1463,7 +1531,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1 = rho_coeff[order-1][k];
@ -1498,8 +1570,12 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -1624,7 +1700,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;

#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -1636,7 +1716,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -1680,8 +1764,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -1702,7 +1790,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx[i] *= hx_inv;
@ -1802,7 +1894,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho6_lookup[idx][k];
@ -1811,7 +1907,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
@ -1846,8 +1946,12 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -1967,7 +2071,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;

#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -1979,7 +2087,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -2023,8 +2135,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -2045,7 +2161,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx[i] *= hx_inv;
@ -2143,7 +2263,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho6_lookup[idx][k];
@ -2152,7 +2276,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
@ -2206,8 +2334,12 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -2398,7 +2530,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;

#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -2410,7 +2546,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -2479,8 +2619,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -2541,7 +2685,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
}
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx0[i] *= hx_inv;
@ -2671,7 +2819,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho6_lookup[idx][k];
@ -2680,7 +2832,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
@ -2721,8 +2877,12 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
@ -2848,7 +3008,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
int idz = dz;

#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho6_lookup[idx][k];
@ -2860,7 +3024,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower_6; k <= nupper_6; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -2909,8 +3077,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
@ -2992,7 +3164,11 @@ void PPPMDispIntel::precompute_rho()
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
@ -3006,7 +3182,11 @@ void PPPMDispIntel::precompute_rho()
}
if (differentiation_flag == 1) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
@ -3026,7 +3206,11 @@ void PPPMDispIntel::precompute_rho()
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower_6; k<=nupper_6;k++) {
FFT_SCALAR r1 = ZEROF;
@ -3040,7 +3224,11 @@ void PPPMDispIntel::precompute_rho()
}
if (differentiation_flag == 1) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower_6; k<=nupper_6;k++) {
FFT_SCALAR r1 = ZEROF;

@ -394,8 +394,12 @@ void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers)
IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));

#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:flag)
#else
#pragma simd reduction(+:flag)
#endif
#pragma vector aligned
#endif
for (int i = iifrom; i < iito; i++) {

@ -500,7 +504,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -509,7 +517,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3;
@ -541,7 +553,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mzyx = l + mzy;
@ -563,7 +579,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);

#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
for (int j = 1; j < nthr; j++) {
@ -645,7 +665,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho0[k] = rho_lookup[idx][k];
@ -654,7 +678,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1 = rho_coeff[order-1][k];
@ -690,7 +718,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mx = l+nxsum;
@ -813,7 +845,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
dz = dz*half_rho_scale + half_rho_scale_plus;
int idz = dz;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k];
@ -825,7 +861,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
} else {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@ -871,7 +911,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mx = l + nxsum;
@ -893,7 +937,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
}

#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int i = ifrom; i < ito; i++) {
particle_ekx[i] *= hx_inv;
@ -942,7 +990,11 @@ void PPPMIntel::precompute_rho()
for (int i = 0; i < rho_points; i++) {
FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
@ -956,7 +1008,11 @@ void PPPMIntel::precompute_rho()
}
if (differentiation_flag == 1) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#endif
for (int k=nlower; k<=nupper;k++) {
FFT_SCALAR r1 = ZEROF;
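Every hunk in this commit applies the same guard: when USE_OMP_SIMD is defined, the standard OpenMP directive (optionally with a reduction clause) is emitted, and otherwise the older Intel-specific pragma is kept as a fallback. A minimal, self-contained sketch of that guard follows; the array names and loop are illustrative only and are not taken from the LAMMPS sources above.

// Illustrative sketch only: the USE_OMP_SIMD guard used throughout this commit,
// applied to a stand-alone reduction loop.  Build with an OpenMP-SIMD-capable
// compiler (e.g. -fopenmp-simd); other compilers simply ignore the pragmas.
#include <cstdio>

int main() {
  const int n = 1024;
  static double src[1024], dst[1024];        // hypothetical data, not LAMMPS buffers
  for (int i = 0; i < n; i++) src[i] = 1.0;
  double sum = 0.0;                          // accumulator reduced across SIMD lanes
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:sum)            // OpenMP 4.0 standard form
#else
#pragma simd reduction(+:sum)                // legacy Intel/Cilk form, now deprecated
#endif
  for (int i = 0; i < n; i++) {
    dst[i] = 2.0 * src[i];
    sum += dst[i];
  }
  printf("sum = %g\n", sum);                 // expect 2048 with the values above
  return 0;
}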