Converting Cilk vectorization directives to the OpenMP standard and replacing some more deprecated vector intrinsics. Data-alignment directives for compiler vectorization are still mostly Intel-specific.
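The change applies one guarded-directive pattern across all of the USER-INTEL bonded styles shown below: the standard OpenMP directive is used when USE_OMP_SIMD is defined (intel_preprocess.h defines it for the LLVM-based Intel compiler), and the deprecated Cilk-era directive remains as the fallback. A minimal sketch of the pattern, reusing names (seangle, sv0..sv5, nfrom, nto) from the first hunk below; the loop body is elided:

    // Sketch only: the guarded-directive pattern this commit applies.
    #if defined(USE_OMP_SIMD)
    // Standard OpenMP 4.x form of the vectorize-with-reduction hint.
    #pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
    #else
    // Deprecated Intel/Cilk form, kept for older compilers.
    #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
    #endif
    for (int n = nfrom; n < nto; n++) {
      // ... bonded-force kernel ...
    }

The alignment hints are the part that stays Intel-specific: #pragma vector aligned has no OpenMP equivalent and is simply re-emitted next to the new directive, while loops whose pointers are known to be 64-byte aligned can use the standard aligned(...:64) clause, as in the FixIntel::reduce_results hunks.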
@@ -162,7 +162,11 @@ void AngleCharmmIntel::eval(const int vflag,
       if (VFLAG && vflag) {
         sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
       }
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
       for (int n = nfrom; n < nto; n ++) {
 #else
       for (int n = nfrom; n < nto; n += npl) {
@@ -246,7 +250,11 @@ void AngleCharmmIntel::eval(const int vflag,
       // apply force to each of 3 atoms

 #ifdef LMP_INTEL_USE_SIMDOFF
+#if defined(USE_OMP_SIMD)
+#pragma omp ordered simd
+#else
 #pragma simdoff
+#endif
 #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
@@ -162,7 +162,11 @@ void AngleHarmonicIntel::eval(const int vflag,
       if (VFLAG && vflag) {
         sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
       }
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
       for (int n = nfrom; n < nto; n ++) {
 #else
       for (int n = nfrom; n < nto; n += npl) {
@@ -228,7 +232,11 @@ void AngleHarmonicIntel::eval(const int vflag,
       // apply force to each of 3 atoms

 #ifdef LMP_INTEL_USE_SIMDOFF
+#if defined(USE_OMP_SIMD)
+#pragma omp ordered simd
+#else
 #pragma simdoff
+#endif
 #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
@@ -158,7 +158,11 @@ void BondFENEIntel::eval(const int vflag,
       if (VFLAG && vflag) {
         sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
       }
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
       for (int n = nfrom; n < nto; n ++) {
 #else
       for (int n = nfrom; n < nto; n += npl) {
@@ -215,7 +219,11 @@ void BondFENEIntel::eval(const int vflag,
       // apply force to each of 2 atoms

 #ifdef LMP_INTEL_USE_SIMDOFF
+#if defined(USE_OMP_SIMD)
+#pragma omp ordered simd
+#else
 #pragma simdoff
+#endif
 #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
@@ -155,7 +155,11 @@ void BondHarmonicIntel::eval(const int vflag,
       if (VFLAG && vflag) {
         sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
       }
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
       for (int n = nfrom; n < nto; n ++) {
 #else
       for (int n = nfrom; n < nto; n += npl) {
@@ -184,7 +188,11 @@ void BondHarmonicIntel::eval(const int vflag,

       // apply force to each of 2 atoms
 #ifdef LMP_INTEL_USE_SIMDOFF
+#if defined(USE_OMP_SIMD)
+#pragma omp ordered simd
+#else
 #pragma simdoff
+#endif
 #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
@@ -181,9 +181,16 @@ void DihedralCharmmIntel::eval(const int vflag,
       }

 #if defined(LMP_SIMD_COMPILER_TEST)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
+                           sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \
+                           spv5)
+#else
 #pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
-                       sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
+                       sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \
+                       spv5)
+#endif
+#pragma vector aligned
       for (int n = nfrom; n < nto; n++) {
 #endif
       for (int n = nfrom; n < nto; n += npl) {
@@ -329,7 +336,11 @@ void DihedralCharmmIntel::eval(const int vflag,


 #if defined(LMP_SIMD_COMPILER_TEST)
+#if defined(USE_OMP_SIMD)
+#pragma omp ordered simd
+#else
 #pragma simdoff
+#endif
 #endif
       {
         if (NEWTON_BOND || i2 < nlocal) {
@@ -408,7 +419,11 @@ void DihedralCharmmIntel::eval(const int vflag,

       // apply force to each of 4 atoms
 #if defined(LMP_SIMD_COMPILER_TEST)
+#if defined(USE_OMP_SIMD)
+#pragma omp ordered simd
+#else
 #pragma simdoff
+#endif
 #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
@@ -154,7 +154,11 @@ void DihedralFourierIntel::eval(const int vflag,
       if (VFLAG && vflag) {
         sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
       }
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
       for (int n = nfrom; n < nto; n ++) {
 #else
       for (int n = nfrom; n < nto; n += npl) {
@@ -304,7 +308,11 @@ void DihedralFourierIntel::eval(const int vflag,
       }

 #ifdef LMP_INTEL_USE_SIMDOFF
+#if defined(USE_OMP_SIMD)
+#pragma omp ordered simd
+#else
 #pragma simdoff
+#endif
 #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
@@ -154,7 +154,11 @@ void DihedralHarmonicIntel::eval(const int vflag,
       if (VFLAG && vflag) {
         sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
       }
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
       for (int n = nfrom; n < nto; n ++) {
 #else
       for (int n = nfrom; n < nto; n += npl) {
@@ -299,7 +303,11 @@ void DihedralHarmonicIntel::eval(const int vflag,
       }

 #ifdef LMP_INTEL_USE_SIMDOFF
+#if defined(USE_OMP_SIMD)
+#pragma omp ordered simd
+#else
 #pragma simdoff
+#endif
 #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
@@ -158,7 +158,11 @@ void DihedralOPLSIntel::eval(const int vflag,
       if (VFLAG && vflag) {
         sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
       }
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
       for (int n = nfrom; n < nto; n ++) {
 #else
       for (int n = nfrom; n < nto; n += npl) {
@@ -319,7 +323,11 @@ void DihedralOPLSIntel::eval(const int vflag,
       }

 #ifdef LMP_INTEL_USE_SIMDOFF
+#if defined(USE_OMP_SIMD)
+#pragma omp ordered simd
+#else
 #pragma simdoff
+#endif
 #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
@@ -635,19 +635,31 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
     if (_nthreads == 4) {
       acc_t *f_scalar3 = f_scalar2 + f_stride4;
       acc_t *f_scalar4 = f_scalar3 + f_stride4;
-      _use_simd_pragma("vector aligned")
-      _use_simd_pragma("simd")
+#if defined(USE_OMP_SIMD)
+#pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4:64)
+#elif defined(LMP_SIMD_COMPILER)
+#pragma vector aligned
+#pragma simd
+#endif
       for (int n = 0; n < o_range; n++)
         f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
     } else if (_nthreads == 2) {
-      _use_simd_pragma("vector aligned")
-      _use_simd_pragma("simd")
+#if defined(USE_OMP_SIMD)
+#pragma omp simd aligned(f_scalar,f_scalar2:64)
+#elif defined(LMP_SIMD_COMPILER)
+#pragma vector aligned
+#pragma simd
+#endif
       for (int n = 0; n < o_range; n++)
         f_scalar[n] += f_scalar2[n];
     } else {
       acc_t *f_scalar3 = f_scalar2 + f_stride4;
-      _use_simd_pragma("vector aligned")
-      _use_simd_pragma("simd")
+#if defined(USE_OMP_SIMD)
+#pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3:64)
+#elif defined(LMP_SIMD_COMPILER)
+#pragma vector aligned
+#pragma simd
+#endif
       for (int n = 0; n < o_range; n++)
         f_scalar[n] += f_scalar2[n] + f_scalar3[n];
     }
@@ -662,8 +674,12 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)

     acc_t *f_scalar2 = f_scalar + f_stride4;
     for (int t = 1; t < _nthreads; t++) {
-      _use_simd_pragma("vector aligned")
-      _use_simd_pragma("simd")
+#if defined(USE_OMP_SIMD)
+#pragma omp simd aligned(f_scalar,f_scalar2:64)
+#elif defined(LMP_SIMD_COMPILER)
+#pragma vector aligned
+#pragma simd
+#endif
       for (int n = iifrom; n < iito; n++)
         f_scalar[n] += f_scalar2[n];
       f_scalar2 += f_stride4;
@@ -99,8 +99,12 @@ void FixNHIntel::remap()

   if (allremap) {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < nlocal; i++) {
       const double d0 = x[i].x - b0;
@@ -112,8 +116,12 @@ void FixNHIntel::remap()
     }
   } else {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < nlocal; i++) {
       if (mask[i] & dilate_group_bit) {
@@ -278,8 +286,12 @@ void FixNHIntel::remap()

   if (allremap) {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < nlocal; i++) {
       x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
@@ -288,8 +300,12 @@ void FixNHIntel::remap()
     }
   } else {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < nlocal; i++) {
       if (mask[i] & dilate_group_bit) {
@@ -415,8 +431,12 @@ void FixNHIntel::nh_v_press()

   if (igroup == 0) {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < nlocal; i++) {
       v[i].x *= f0;
@@ -425,8 +445,12 @@ void FixNHIntel::nh_v_press()
     }
   } else {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < nlocal; i++) {
       if (mask[i] & groupbit) {
@@ -448,8 +472,12 @@ void FixNHIntel::nve_v()
   double * _noalias const v = atom->v[0];
   const double * _noalias const f = atom->f[0];
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
   for (int i = 0; i < _nlocal3; i++)
     v[i] += _dtfm[i] * f[i];
@@ -468,15 +496,23 @@ void FixNHIntel::nve_x()

   if (igroup == 0) {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < _nlocal3; i++)
       x[i] += dtv * v[i];
   } else {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < _nlocal3; i++) {
       if (_dtfm[i] != 0.0)
@@ -500,15 +536,23 @@ void FixNHIntel::nh_v_temp()

   if (igroup == 0) {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < _nlocal3; i++)
       v[i] *= factor_eta;
   } else {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < _nlocal3; i++) {
       if (_dtfm[i] != 0.0)
@@ -97,8 +97,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
   dtq = 0.5 * dtv;

 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
   for (int i = 0; i < _nlocal3; i++) {
     v[i] += _dtfm[i] * f[i];
@@ -108,8 +112,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
   // update angular momentum by 1/2 step
   if (igroup == 0) {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < nlocal; i++) {
       double *quat = bonus[ellipsoid[i]].quat;
@@ -118,8 +126,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/)
     }
   } else {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < nlocal; i++) {
       if (mask[i] & groupbit) {
@@ -143,8 +155,12 @@ void FixNVEAsphereIntel::final_integrate()
   const double * _noalias const torque = atom->torque[0];

 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
   for (int i = 0; i < _nlocal3; i++) {
     v[i] += _dtfm[i] * f[i];
@@ -68,8 +68,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
   if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) {
     const double dtfm = dtf / atom->mass[1];
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < _nlocal3; i++) {
       v[i] += dtfm * f[i];
@@ -78,8 +82,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
   } else if (igroup == 0) {
     if (neighbor->ago == 0) reset_dt();
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < _nlocal3; i++) {
       v[i] += _dtfm[i] * f[i];
@@ -88,8 +96,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/)
   } else {
     if (neighbor->ago == 0) reset_dt();
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < _nlocal3; i++) {
       if (_dtfm[i] != 0.0) {
@@ -112,16 +124,24 @@ void FixNVEIntel::final_integrate()
     _nlocal3 = 3 * atom->nlocal;
     const double dtfm = dtf / atom->mass[1];
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < _nlocal3; i++)
       v[i] += dtfm * f[i];
   } else if (igroup == 0) {
     if (neighbor->ago == 0) reset_dt();
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < _nlocal3; i++) {
       v[i] += _dtfm[i] * f[i];
@@ -129,8 +149,12 @@ void FixNVEIntel::final_integrate()
   } else {
     if (neighbor->ago == 0) reset_dt();
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
     for (int i = 0; i < _nlocal3; i++)
       v[i] += _dtfm[i] * f[i];
@@ -165,7 +165,11 @@ void ImproperCvffIntel::eval(const int vflag,
       if (VFLAG && vflag) {
         sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
       }
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
       for (int n = nfrom; n < nto; n++) {
 #else
       for (int n = nfrom; n < nto; n += npl) {
@@ -247,7 +251,11 @@ void ImproperCvffIntel::eval(const int vflag,

         flt_t p, pd;
 #ifdef LMP_INTEL_USE_SIMDOFF_FIX
+#if defined(USE_OMP_SIMD)
+#pragma omp ordered simd
+#else
 #pragma simdoff
+#endif
 #endif
         {
           if (m == 2) {
@@ -319,7 +327,11 @@ void ImproperCvffIntel::eval(const int vflag,
       // apply force to each of 4 atoms

 #ifdef LMP_INTEL_USE_SIMDOFF_FIX
+#if defined(USE_OMP_SIMD)
+#pragma omp ordered simd
+#else
 #pragma simdoff
+#endif
 #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
@@ -167,7 +167,11 @@ void ImproperHarmonicIntel::eval(const int vflag,
       if (VFLAG && vflag) {
         sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
       }
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
       for (int n = nfrom; n < nto; n++) {
 #else
       for (int n = nfrom; n < nto; n += npl) {
@@ -276,7 +280,11 @@ void ImproperHarmonicIntel::eval(const int vflag,
       // apply force to each of 4 atoms

 #ifdef LMP_INTEL_USE_SIMDOFF
+#if defined(USE_OMP_SIMD)
+#pragma omp ordered simd
+#else
 #pragma simdoff
+#endif
 #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
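The header changes that follow replace deprecated KNC-era gather/scatter intrinsics. The _mm512_*_i32logather_pd / _i32loscatter_pd forms took a full 512-bit index vector and used only its low eight lanes; the AVX-512 replacements take a 256-bit index vector, so the low half is extracted with _mm512_castsi512_si256, which only reinterprets the register and costs no instructions. A minimal sketch of the substitution (the wrapper name gather8 and its arguments are illustrative, not from the source):

    #include <immintrin.h>

    // Old, deprecated KNC form:
    //   __m512d v = _mm512_i32logather_pd(idx, p, _MM_SCALE_8);
    // New AVX-512 form, as applied throughout the hunks below:
    __m512d gather8(const double *p, __m512i idx) {
      return _mm512_i32gather_pd(_mm512_castsi512_si256(idx), p, _MM_SCALE_8);
    }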
@@ -127,7 +127,8 @@ struct vector_ops<double, KNC> {
   }
   template<int scale>
   static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
-    return _mm512_mask_i32logather_pd(from, mask, idx, base, scale);
+    return _mm512_mask_i32gather_pd(from, mask, _mm512_castsi512_si256(idx),
+                                    base, scale);
   }
   static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
     return _mm512_mask_blend_pd(mask, a, b);
@@ -511,7 +511,8 @@ public:
                              const int scale) {
     assert(scale == sizeof(FVEC_SCAL_T));
 # if FVEC_LEN==8
-    return FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
+    return FVEC_SUFFIX(_mm512_i32gather_)(_mm512_castsi512_si256(idx.val_),
+                                          mem, sizeof(FVEC_SCAL_T));
 # else
     return FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
 # endif
@@ -522,8 +523,8 @@ public:
   ) {
     assert(scale == sizeof(FVEC_SCAL_T));
 # if FVEC_LEN==8
-    return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
-                                                 mem, sizeof(FVEC_SCAL_T));
+    return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_,
+      _mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T));
 # else
     return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
                                                mem, sizeof(FVEC_SCAL_T));
@@ -609,8 +610,8 @@ public:
   ) {
     assert(scale == sizeof(FVEC_SCAL_T));
 # if FVEC_LEN==8
-    return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
-                                                 mem, sizeof(FVEC_SCAL_T));
+    return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_,
+      _mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T));
 # else
     return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
                                                mem, sizeof(FVEC_SCAL_T));
@@ -622,8 +623,9 @@ public:
   ) {
     assert(scale == sizeof(FVEC_SCAL_T));
 # if FVEC_LEN==8
-    FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, a.val_,
-                                           sizeof(FVEC_SCAL_T));
+    FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_,
+                                         _mm512_castsi512_si256(idx.val_),
+                                         a.val_, sizeof(FVEC_SCAL_T));
 # else
     FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, a.val_,
                                          sizeof(FVEC_SCAL_T));
@@ -666,11 +668,11 @@ public:
     const double * mem, const int scale
   ) {
     assert(scale == sizeof(double));
-    __m512d lo = _mm512_mask_i32logather_pd(src.lo_, mask.val_, idx.val_, mem,
-                                            sizeof(double));
-    __m512d hi = _mm512_mask_i32logather_pd(src.hi_, get_bvec_hi(mask.val_),
-                                            get_ivec_hi(idx.val_), mem,
-                                            sizeof(double));
+    __m512d lo = _mm512_mask_i32gather_pd(src.lo_, mask.val_,
+                                          _mm512_castsi512_si256(idx.val_),
+                                          mem, sizeof(double));
+    __m512d hi = _mm512_mask_i32gather_pd(src.hi_, get_bvec_hi(mask.val_),
+      _mm512_castsi512_si256(get_ivec_hi(idx.val_)), mem, sizeof(double));
     return avec16pd(lo, hi);
   }
   VEC_INLINE static void mask_i32loscatter(
@@ -678,10 +680,12 @@ public:
     const avec16pd &a, const int scale
   ) {
     assert(scale == sizeof(double));
-    _mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_,
+    _mm512_mask_i32scatter_pd(mem, mask.val_,
+                              _mm512_castsi512_si256(idx.val_), a.lo_,
                               sizeof(double));
-    _mm512_mask_i32loscatter_pd(mem, get_bvec_hi(mask.val_),
-                                get_ivec_hi(idx.val_), a.hi_, sizeof(double));
+    _mm512_mask_i32scatter_pd(mem, get_bvec_hi(mask.val_),
+                              _mm512_castsi512_si256(get_ivec_hi(idx.val_)),
+                              a.hi_, sizeof(double));
   }

 #define AVEC2_BINOP(the_sym, the_name) \
@@ -17,8 +17,13 @@
 ------------------------------------------------------------------------- */

 #ifdef __INTEL_LLVM_COMPILER
+#define USE_OMP_SIMD
 #define __INTEL_COMPILER __INTEL_LLVM_COMPILER
 #define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
+#define _MM_SCALE_1 1
+#define _MM_SCALE_2 2
+#define _MM_SCALE_4 4
+#define _MM_SCALE_8 8
 #endif

 #ifdef __INTEL_COMPILER
@@ -332,6 +337,9 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,

 #endif

+// TO BE DEPRECATED
+#ifndef USE_OMP_SIMD
+
 #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
                                   f_stride, pos, ov0, ov1, ov2, \
                                   ov3, ov4, ov5) \
@@ -526,6 +534,198 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
   } \
 }

+#else
+
+#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
+                                  f_stride, pos, ov0, ov1, ov2, \
+                                  ov3, ov4, ov5) \
+{ \
+  acc_t *f_scalar = &f_start[0].x; \
+  flt_t *x_scalar = &pos[minlocal].x; \
+  int f_stride4 = f_stride * 4; \
+  _alignvar(acc_t ovv[16],64); \
+  int vwidth; \
+  if (sizeof(acc_t) == sizeof(double)) \
+    vwidth = INTEL_COMPILE_WIDTH/2; \
+  else \
+    vwidth = INTEL_COMPILE_WIDTH; \
+  if (vwidth < 4) vwidth = 4; \
+  _use_simd_pragma("omp simd aligned(ovv:64)") \
+  for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \
+  int remainder = lt % vwidth; \
+  if (lf > lt) remainder = 0; \
+  const int v_range = lt - remainder; \
+  if (nthreads == 2) { \
+    acc_t *f_scalar2 = f_scalar + f_stride4; \
+    for (int n = lf; n < v_range; n += vwidth) { \
+      _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,ovv,x_scalar:64)")\
+      for (int v = 0; v < vwidth; v++) { \
+        f_scalar[n+v] += f_scalar2[n+v]; \
+        ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
+      } \
+      ov3 += f_scalar[n+1] * x_scalar[n+0]; \
+      ov4 += f_scalar[n+2] * x_scalar[n+0]; \
+      ov5 += f_scalar[n+2] * x_scalar[n+1]; \
+      if (vwidth > 4) { \
+        ov3 += f_scalar[n+5] * x_scalar[n+4]; \
+        ov4 += f_scalar[n+6] * x_scalar[n+4]; \
+        ov5 += f_scalar[n+6] * x_scalar[n+5]; \
+      } \
+      if (vwidth > 8) { \
+        ov3 += f_scalar[n+9] * x_scalar[n+8]; \
+        ov3 += f_scalar[n+13] * x_scalar[n+12]; \
+        ov4 += f_scalar[n+10] * x_scalar[n+8]; \
+        ov4 += f_scalar[n+14] * x_scalar[n+12]; \
+        ov5 += f_scalar[n+10] * x_scalar[n+9]; \
+        ov5 += f_scalar[n+14] * x_scalar[n+13]; \
+      } \
+    } \
+    _use_simd_pragma("vector aligned") \
+    _use_simd_pragma("ivdep") \
+    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
+    for (int n = v_range; n < lt; n++) \
+      f_scalar[n] += f_scalar2[n]; \
+  } else if (nthreads==4) { \
+    acc_t *f_scalar2 = f_scalar + f_stride4; \
+    acc_t *f_scalar3 = f_scalar2 + f_stride4; \
+    acc_t *f_scalar4 = f_scalar3 + f_stride4; \
+    for (int n = lf; n < v_range; n += vwidth) { \
+      _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4,ovv:64)") \
+      for (int v = 0; v < vwidth; v++) { \
+        f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \
+          f_scalar4[n+v]; \
+        ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
+      } \
+      ov3 += f_scalar[n+1] * x_scalar[n+0]; \
+      ov4 += f_scalar[n+2] * x_scalar[n+0]; \
+      ov5 += f_scalar[n+2] * x_scalar[n+1]; \
+      if (vwidth > 4) { \
+        ov3 += f_scalar[n+5] * x_scalar[n+4]; \
+        ov4 += f_scalar[n+6] * x_scalar[n+4]; \
+        ov5 += f_scalar[n+6] * x_scalar[n+5]; \
+      } \
+      if (vwidth > 8) { \
+        ov3 += f_scalar[n+9] * x_scalar[n+8]; \
+        ov3 += f_scalar[n+13] * x_scalar[n+12]; \
+        ov4 += f_scalar[n+10] * x_scalar[n+8]; \
+        ov4 += f_scalar[n+14] * x_scalar[n+12]; \
+        ov5 += f_scalar[n+10] * x_scalar[n+9]; \
+        ov5 += f_scalar[n+14] * x_scalar[n+13]; \
+      } \
+    } \
+    _use_simd_pragma("vector aligned") \
+    _use_simd_pragma("ivdep") \
+    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
+    for (int n = v_range; n < lt; n++) \
+      f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \
+  } else if (nthreads==1) { \
+    for (int n = lf; n < v_range; n += vwidth) { \
+      _use_simd_pragma("omp simd aligned(ovv,f_scalar,x_scalar:64)") \
+      for (int v = 0; v < vwidth; v++) \
+        ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
+      ov3 += f_scalar[n+1] * x_scalar[n+0]; \
+      ov4 += f_scalar[n+2] * x_scalar[n+0]; \
+      ov5 += f_scalar[n+2] * x_scalar[n+1]; \
+      if (vwidth > 4) { \
+        ov3 += f_scalar[n+5] * x_scalar[n+4]; \
+        ov4 += f_scalar[n+6] * x_scalar[n+4]; \
+        ov5 += f_scalar[n+6] * x_scalar[n+5]; \
+      } \
+      if (vwidth > 8) { \
+        ov3 += f_scalar[n+9] * x_scalar[n+8]; \
+        ov3 += f_scalar[n+13] * x_scalar[n+12]; \
+        ov4 += f_scalar[n+10] * x_scalar[n+8]; \
+        ov4 += f_scalar[n+14] * x_scalar[n+12]; \
+        ov5 += f_scalar[n+10] * x_scalar[n+9]; \
+        ov5 += f_scalar[n+14] * x_scalar[n+13]; \
+      } \
+    } \
+  } else if (nthreads==3) { \
+    acc_t *f_scalar2 = f_scalar + f_stride4; \
+    acc_t *f_scalar3 = f_scalar2 + f_stride4; \
+    for (int n = lf; n < v_range; n += vwidth) { \
+      _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,ovv,x_scalar:64)") \
+      for (int v = 0; v < vwidth; v++) { \
+        f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \
+        ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
+      } \
+      ov3 += f_scalar[n+1] * x_scalar[n+0]; \
+      ov4 += f_scalar[n+2] * x_scalar[n+0]; \
+      ov5 += f_scalar[n+2] * x_scalar[n+1]; \
+      if (vwidth > 4) { \
+        ov3 += f_scalar[n+5] * x_scalar[n+4]; \
+        ov4 += f_scalar[n+6] * x_scalar[n+4]; \
+        ov5 += f_scalar[n+6] * x_scalar[n+5]; \
+      } \
+      if (vwidth > 8) { \
+        ov3 += f_scalar[n+9] * x_scalar[n+8]; \
+        ov3 += f_scalar[n+13] * x_scalar[n+12]; \
+        ov4 += f_scalar[n+10] * x_scalar[n+8]; \
+        ov4 += f_scalar[n+14] * x_scalar[n+12]; \
+        ov5 += f_scalar[n+10] * x_scalar[n+9]; \
+        ov5 += f_scalar[n+14] * x_scalar[n+13]; \
+      } \
+    } \
+    _use_simd_pragma("vector aligned") \
+    _use_simd_pragma("ivdep") \
+    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
+    for (int n = v_range; n < lt; n++) \
+      f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \
+  } \
+  for (int n = v_range; n < lt; n += 4) { \
+    _use_simd_pragma("vector aligned") \
+    _use_simd_pragma("ivdep") \
+    for (int v = 0; v < 4; v++) \
+      ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
+    ov3 += f_scalar[n+1] * x_scalar[n+0]; \
+    ov4 += f_scalar[n+2] * x_scalar[n+0]; \
+    ov5 += f_scalar[n+2] * x_scalar[n+1]; \
+  } \
+  ov0 += ovv[0]; \
+  ov1 += ovv[1]; \
+  ov2 += ovv[2]; \
+  if (vwidth > 4) { \
+    ov0 += ovv[4]; \
+    ov1 += ovv[5]; \
+    ov2 += ovv[6]; \
+  } \
+  if (vwidth > 8) { \
+    ov0 += ovv[8] + ovv[12]; \
+    ov1 += ovv[9] + ovv[13]; \
+    ov2 += ovv[10] + ovv[14]; \
+  } \
+}
+
+#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \
+                               f_stride, pos, offload, vflag, ov0, ov1, \
+                               ov2, ov3, ov4, ov5) \
+{ \
+  int o_range = (nall - minlocal) * 4; \
+  IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \
+                            sizeof(acc_t)); \
+  \
+  acc_t *f_scalar = &f_start[0].x; \
+  int f_stride4 = f_stride * 4; \
+  int t; \
+  if (vflag == VIRIAL_FDOTR) t = 4; else t = 1; \
+  acc_t *f_scalar2 = f_scalar + f_stride4 * t; \
+  for ( ; t < nthreads; t++) { \
+    _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2:64)") \
+    for (int n = iifrom; n < iito; n++) \
+      f_scalar[n] += f_scalar2[n]; \
+    f_scalar2 += f_stride4; \
+  } \
+  \
+  if (vflag == VIRIAL_FDOTR) { \
+    int nt_min = MIN(4,nthreads); \
+    IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \
+                              f_stride, pos, ov0, ov1, ov2, ov3, ov4, \
+                              ov5); \
+  } \
+}
+
+#endif
+
 #ifdef _LMP_INTEL_OFFLOAD
 #include <sys/time.h>

@@ -173,7 +173,7 @@ namespace ip_simd {
   }

   inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) {
-    return _mm512_i32logather_pd(i, p, _MM_SCALE_8);
+    return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8);
   }

   inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p,
@@ -190,8 +190,8 @@ namespace ip_simd {

   inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p,
                                  const SIMD_int &i) {
-    return _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, p,
-                                      _MM_SCALE_8);
+    return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                    _mm512_castsi512_si256(i), p, _MM_SCALE_8);
   }

   template <typename T>
@@ -227,8 +227,8 @@ namespace ip_simd {

   inline SIMD_double SIMD_gatherz(const SIMD_mask &m, const double *p,
                                   const SIMD_int &i) {
-    return _mm512_mask_i32logather_pd( _mm512_set1_pd(0.0), m, i, p,
-                                       _MM_SCALE_8);
+    return _mm512_mask_i32gather_pd( _mm512_set1_pd(0.0), m,
+                                     _mm512_castsi512_si256(i),p, _MM_SCALE_8);
   }

   // ------- Store Operations
@@ -257,7 +257,8 @@ namespace ip_simd {

   inline void SIMD_scatter(const SIMD_mask &m, double *p,
                            const SIMD_int &i, const SIMD_double &vec) {
-    _mm512_mask_i32loscatter_pd(p, m, i, vec, _MM_SCALE_8);
+    _mm512_mask_i32scatter_pd(p, m, _mm512_castsi512_si256(i), vec,
+                              _MM_SCALE_8);
   }

   // ------- Arithmetic Operations
@@ -834,22 +835,28 @@ namespace ip_simd {
   inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
                                const SIMD_int &i, SIMD_double &x,
                                SIMD_double &y, SIMD_double &z) {
-    x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom,
+    x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                 _mm512_castsi512_si256(i), atom,
                                  _MM_SCALE_2);
-    y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1,
+    y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                 _mm512_castsi512_si256(i), atom+1,
                                  _MM_SCALE_2);
-    z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2,
+    z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                 _mm512_castsi512_si256(i), atom+2,
                                  _MM_SCALE_2);
   }

   inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
                                const SIMD_int &i, SIMD_double &x,
                                SIMD_double &y, SIMD_double &z, SIMD_int &type) {
-    x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom,
+    x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                 _mm512_castsi512_si256(i), atom,
                                  _MM_SCALE_2);
-    y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1,
+    y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                 _mm512_castsi512_si256(i), atom+1,
                                  _MM_SCALE_2);
-    z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2,
+    z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                 _mm512_castsi512_si256(i), atom+2,
                                  _MM_SCALE_2);
     type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3,
                                        _MM_SCALE_2);
@@ -888,10 +895,12 @@ namespace ip_simd {
                                const SIMD_int &joffset, SIMD_double &eng) {
     SIMD_double jeng;
     SIMD_conflict_pi_reduce1(rmask, joffset, eng);
-    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
+    jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
+                                    _mm512_castsi512_si256(joffset),
                                     force, _MM_SCALE_2);
     jeng = jeng + eng;
-    _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
+    _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
+                              jeng, _MM_SCALE_2);
   }

   inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
@@ -899,20 +908,24 @@ namespace ip_simd {
     SIMD_double engd, jeng;
     engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng));
     SIMD_conflict_pi_reduce1(rmask, joffset, engd);
-    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
+    jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
+                                    _mm512_castsi512_si256(joffset),
                                     force, _MM_SCALE_2);
     jeng = jeng + engd;
-    _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
+    _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
+                              jeng, _MM_SCALE_2);

     SIMD_mask rmask2 = rmask >> 8;
     engd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                              _mm512_shuffle_f32x4(eng,eng,238)));
     SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
     SIMD_conflict_pi_reduce1(rmask2, joffset2, engd);
-    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
+    jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
+                                    _mm512_castsi512_si256(joffset2),
                                     force, _MM_SCALE_2);
     jeng = jeng + engd;
-    _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2);
+    _mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
+                              jeng, _MM_SCALE_2);
   }

   inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force,
@@ -926,10 +939,12 @@ namespace ip_simd {

     SIMD_double jeng;
     SIMD_conflict_pi_reduce1(rmask, joffset, eng);
-    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
+    jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
+                                    _mm512_castsi512_si256(joffset),
                                     force, _MM_SCALE_2);
     jeng = jeng + eng;
-    _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
+    _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
+                              jeng, _MM_SCALE_2);
   }

   inline void SIMD_safe_jforce(const SIMD_mask &m, float *force,
@@ -956,18 +971,24 @@ namespace ip_simd {
                                SIMD_double &fy, SIMD_double &fz) {
     SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
     SIMD_double jfrc;
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
+    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                    _mm512_castsi512_si256(i), force,
                                     _MM_SCALE_2);
     jfrc = jfrc + fx;
-    _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
+    _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
+                              _MM_SCALE_2);
+    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                    _mm512_castsi512_si256(i), force + 1,
                                     _MM_SCALE_2);
     jfrc = jfrc + fy;
-    _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
+    _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
+                              _MM_SCALE_2);
+    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                    _mm512_castsi512_si256(i), force + 2,
                                     _MM_SCALE_2);
     jfrc = jfrc + fz;
-    _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
+    _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
+                              _MM_SCALE_2);
   }

   inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force,
@@ -979,18 +1000,24 @@ namespace ip_simd {
     amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
     SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
     SIMD_double jfrc;
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
+    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
+                                    _mm512_castsi512_si256(joffset),
                                     force, _MM_SCALE_2);
     jfrc = jfrc + amxd;
-    _mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
+    _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
+                              jfrc, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
+                                    _mm512_castsi512_si256(joffset),
                                     force + 1, _MM_SCALE_2);
     jfrc = jfrc + amyd;
-    _mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
+    _mm512_mask_i32scatter_pd(force+1, rmask, _mm512_castsi512_si256(joffset),
+                              jfrc, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
+                                    _mm512_castsi512_si256(joffset),
                                     force + 2, _MM_SCALE_2);
     jfrc = jfrc + amzd;
-    _mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2);
+    _mm512_mask_i32scatter_pd(force+2, rmask, _mm512_castsi512_si256(joffset),
+                              jfrc, _MM_SCALE_2);

     SIMD_mask rmask2 = rmask >> 8;
     amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
@@ -1001,18 +1028,26 @@ namespace ip_simd {
                              _mm512_shuffle_f32x4(amz,amz,238)));
     SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
     SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
+    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
+                                    _mm512_castsi512_si256(joffset2),
                                     force, _MM_SCALE_2);
     jfrc = jfrc + amxd;
-    _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
+    _mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
+                              jfrc, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
+                                    _mm512_castsi512_si256(joffset2),
                                     force + 1, _MM_SCALE_2);
     jfrc = jfrc + amyd;
-    _mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
+    _mm512_mask_i32scatter_pd(force+1, rmask2,
+                              _mm512_castsi512_si256(joffset2), jfrc,
+                              _MM_SCALE_2);
+    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
+                                    _mm512_castsi512_si256(joffset2),
                                     force + 2, _MM_SCALE_2);
     jfrc = jfrc + amzd;
-    _mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2);
+    _mm512_mask_i32scatter_pd(force+2, rmask2,
+                              _mm512_castsi512_si256(joffset2), jfrc,
+                              _MM_SCALE_2);
   }

   inline void SIMD_jforce_update(const SIMD_mask &m, float *force,
@@ -1064,18 +1099,24 @@ namespace ip_simd {
                                  const SIMD_int &i, const SIMD_double &fx,
                                  const SIMD_double &fy, const SIMD_double &fz) {
     SIMD_double jfrc;
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
+    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                    _mm512_castsi512_si256(i), force,
                                     _MM_SCALE_2);
     jfrc = jfrc - fx;
-    _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
+    _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
+                              _MM_SCALE_2);
+    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                    _mm512_castsi512_si256(i), force + 1,
                                     _MM_SCALE_2);
     jfrc = jfrc - fy;
-    _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
+    _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
+                              _MM_SCALE_2);
+    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                    _mm512_castsi512_si256(i), force + 2,
                                     _MM_SCALE_2);
     jfrc = jfrc - fz;
-    _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
+    _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
+                              _MM_SCALE_2);
   }

   inline void SIMD_jforce_update(const SIMD_mask &rmask,
@ -1502,11 +1543,12 @@ namespace ip_simd {
|
|||||||
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
|
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
|
||||||
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
|
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
|
||||||
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
|
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
|
||||||
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
|
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
|
||||||
hmask, k, force + 3,
|
_mm512_castsi512_si256(k),
|
||||||
_MM_SCALE_2);
|
force + 3, _MM_SCALE_2);
|
||||||
keng = keng + hevdwl;
|
keng = keng + hevdwl;
|
||||||
_mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
|
_mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
|
||||||
|
keng, _MM_SCALE_2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1523,11 +1565,12 @@ namespace ip_simd {
|
|||||||
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
|
fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
|
||||||
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
|
fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
|
||||||
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
|
SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
|
||||||
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
|
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
|
||||||
hmask, k, force + 3,
|
_mm512_castsi512_si256(k),
|
||||||
_MM_SCALE_2);
|
force + 3, _MM_SCALE_2);
|
||||||
keng = keng + hevdwl;
|
keng = keng + hevdwl;
|
||||||
_mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
|
_mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
|
||||||
|
keng, _MM_SCALE_2);
|
||||||
}
|
}
|
||||||
SIMD_mask hmask2 = hmask >> 8;
|
SIMD_mask hmask2 = hmask >> 8;
|
||||||
facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(
|
facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(
|
||||||
@ -1539,11 +1582,13 @@ namespace ip_simd {
|
|||||||
fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
|
fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
|
||||||
SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238);
|
SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238);
|
||||||
SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl);
|
SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl);
|
||||||
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
|
SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(),
|
||||||
hmask2, k2, force + 3,
|
hmask2,
|
||||||
_MM_SCALE_2);
|
_mm512_castsi512_si256(k2),
|
||||||
|
force + 3, _MM_SCALE_2);
|
||||||
keng = keng + hevdwl;
|
keng = keng + hevdwl;
|
||||||
_mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2);
|
_mm512_mask_i32scatter_pd(force + 3, hmask2, _mm512_castsi512_si256(k2),
|
||||||
|
keng, _MM_SCALE_2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1815,24 +1860,32 @@ namespace ip_simd {
|
|||||||
const int EFLAG, const int eatom,
|
const int EFLAG, const int eatom,
|
||||||
const SIMD_double &fwtmp) {
|
const SIMD_double &fwtmp) {
|
||||||
SIMD_double jfrc;
|
SIMD_double jfrc;
|
||||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
|
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||||
|
_mm512_castsi512_si256(i), force,
|
||||||
_MM_SCALE_2);
|
_MM_SCALE_2);
|
||||||
jfrc = jfrc + fx;
|
jfrc = jfrc + fx;
|
||||||
_mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
|
_mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
|
||||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
|
_MM_SCALE_2);
|
||||||
|
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||||
|
_mm512_castsi512_si256(i), force + 1,
|
||||||
_MM_SCALE_2);
|
_MM_SCALE_2);
|
||||||
jfrc = jfrc + fy;
|
jfrc = jfrc + fy;
|
||||||
_mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
|
_mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
|
||||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
|
_MM_SCALE_2);
|
||||||
|
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||||
|
_mm512_castsi512_si256(i), force + 2,
|
||||||
_MM_SCALE_2);
|
_MM_SCALE_2);
|
||||||
jfrc = jfrc + fz;
|
jfrc = jfrc + fz;
|
||||||
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
|
_mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
|
||||||
|
_MM_SCALE_2);
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i,
|
jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
|
||||||
|
_mm512_castsi512_si256(i),
|
||||||
force + 3, _MM_SCALE_2);
|
force + 3, _MM_SCALE_2);
|
||||||
jfrc = jfrc + fwtmp;
|
jfrc = jfrc + fwtmp;
|
||||||
_mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2);
|
_mm512_mask_i32scatter_pd(force+3, m, _mm512_castsi512_si256(i), jfrc,
|
||||||
|
_MM_SCALE_2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
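Note on the intrinsic swap above: _mm512_mask_i32logather_pd and _mm512_mask_i32loscatter_pd read their 32-bit indices from the low half of a full __m512i register and are only accepted by the older Intel toolchains, while the standard AVX-512F forms take an explicit __m256i index, hence the added _mm512_castsi512_si256 (a register reinterpretation that emits no instruction). A minimal sketch of the replacement shape, not taken from the patch: the helper names are placeholders, and it uses _MM_SCALE_8 for a plain double array where the patch uses _MM_SCALE_2 to match its own buffer layout.

#include <immintrin.h>

// Masked gather of 8 doubles: AVX-512F form with the low 8 of 16 packed
// 32-bit indices extracted by the zero-cost cast.
static inline __m512d masked_gather8(__m512d v, __mmask8 mask, __m512i idx,
                                     const double *src) {
  return _mm512_mask_i32gather_pd(v, mask, _mm512_castsi512_si256(idx),
                                  src, _MM_SCALE_8);
}

// Matching masked scatter of 8 doubles.
static inline void masked_scatter8(double *dst, __mmask8 mask, __m512i idx,
                                   __m512d v) {
  _mm512_mask_i32scatter_pd(dst, mask, _mm512_castsi512_si256(idx), v,
                            _MM_SCALE_8);
}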
@@ -324,7 +324,11 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
             const int bstart = binhead[ibin + binstart[k]];
             const int bend = binhead[ibin + binend[k]];
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
             for (int jj = bstart; jj < bend; jj++)
               tj[ncount++] = binpacked[jj];

@@ -345,15 +349,23 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
               const int bstart = binhead[ibin + stencil[k]];
               const int bend = binhead[ibin + stencil[k] + 1];
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
               for (int jj = bstart; jj < bend; jj++)
                 tj[ncount++] = binpacked[jj];
             }
           } // if i < nlocal
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
           for (int u = 0; u < ncount; u++) {
             const int j = tj[u];

@@ -425,12 +437,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
         int alln = n;
         n = 0;
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
 #ifdef LMP_INTEL_NBOR_COMPAT
 #pragma ivdep
 #else
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
 #endif
+#endif
+#pragma vector aligned
 #endif
         for (int u = 0; u < alln; u++) {
           int which;

@@ -454,12 +470,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
         alln = n2;
         n2 = maxnbors * 2;
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
 #ifdef LMP_INTEL_NBOR_COMPAT
 #pragma ivdep
 #else
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
 #endif
+#endif
+#pragma vector aligned
 #endif
         for (int u = n2; u < alln; u++) {
           int which;
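All of the former bare #pragma simd sites in the neighbor-list builders now carry the same compile-time guard. A condensed sketch of the convention, with a placeholder function and arguments: USE_OMP_SIMD is assumed to be defined by the build when the compiler accepts OpenMP 4 simd directives, and #pragma vector aligned stays outside the guard because, as the commit message notes, the alignment directives are still mostly Intel-specific.

// Sketch only: select the OpenMP or the deprecated Intel directive at
// compile time; the loop body itself is unchanged.
static void reduce_rows(double *rho, const double *trho, int n) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma vector aligned
#endif
  for (int i = 0; i < n; i++)
    rho[i] += trho[i];   // assumes both buffers are vector-aligned
}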
@@ -344,14 +344,22 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
             const int bstart = binhead[ibin + binstart[k]];
             const int bend = binhead[ibin + binend[k]];
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
             for (int jj = bstart; jj < bend; jj++)
               tj[ncount++] = binpacked[jj];
           }
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
           for (int u = 0; u < ncount; u++) {
             const int j = tj[u];

@@ -375,7 +383,11 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
           const int bstart = binhead[ibin];
           const int bend = binhead[ibin + 1];
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
           for (int jj = bstart; jj < bend; jj++) {
             const int j = binpacked[jj];

@@ -533,12 +545,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
 
         n = pack_offset;
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
 #ifdef LMP_INTEL_NBOR_COMPAT
 #pragma ivdep
 #else
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
 #endif
+#endif
+#pragma vector aligned
 #endif
         for (int u = n; u < alln; u++) {
           int which;

@@ -566,12 +582,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
         n2 = pack_offset + maxnbors;
 
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
 #ifdef LMP_INTEL_NBOR_COMPAT
 #pragma ivdep
 #else
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
 #endif
+#endif
+#pragma vector aligned
 #endif
         for (int u = n2; u < alln; u++) {
           int which;

@@ -737,8 +757,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
           int jnum = numneigh[i];
           if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
 #if __INTEL_COMPILER+0 > 1499
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(max:vlmax,vgmax) \
+  reduction(min:vlmin, vgmin)
+#else
+#pragma simd reduction(max:vlmax,vgmax) \
+  reduction(min:vlmin, vgmin)
+#endif
 #pragma vector aligned
-#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
 #endif
           for (int jj = 0; jj < jnum; jj++) {
             const int j = jlist[jj] & NEIGHMASK;

@@ -782,8 +808,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
       int jnum = numneigh[i];
       if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
       int jj = 0;
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
       for (jj = 0; jj < jnum; jj++) {
         const int which = jlist[jj] >> SBBITS & 3;
         const int j = jlist[jj] & NEIGHMASK;
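The bin_newton hunk above also carries min/max reductions, which OpenMP 4.0 accepts on omp simd with the same clause syntax the Intel-specific pragma used. A minimal sketch under that assumption; the function and its arguments are hypothetical, not from the patch.

#include <climits>

// Sketch: vectorized bounds scan with min/max reductions under either
// directive set selected by USE_OMP_SIMD.
static void bounds(const int *jlist, int jnum, int *lo, int *hi) {
  int vlmin = INT_MAX, vlmax = INT_MIN;
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(max:vlmax) reduction(min:vlmin)
#else
#pragma simd reduction(max:vlmax) reduction(min:vlmin)
#endif
  for (int jj = 0; jj < jnum; jj++) {
    const int j = jlist[jj];
    if (j < vlmin) vlmin = j;
    if (j > vlmax) vlmax = j;
  }
  *lo = vlmin;
  *hi = vlmax;
}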
@@ -248,12 +248,18 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
         if (NEWTON_PAIR == 0)
-          if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+          if (vflag == VIRIAL_PAIR)
+            sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                           sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
                        sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
+#pragma vector aligned
 #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcecoul, forcebuck, evdwl, ecoul;
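Every pair kernel below repeats this "+:" reduction form, so one sketch covers them all; the function and array names are placeholders. The only difference between the two branches is the directive spelling, the reduction clause itself is identical, which is what makes the mechanical conversion safe.

// Sketch: multi-variable sum reduction under either directive set.
static double accumulate(const double *ev, int jnum) {
  double fxtmp = 0.0, sevdwl = 0.0;
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd reduction(+:fxtmp, sevdwl)
#else
#pragma simd reduction(+:fxtmp, sevdwl)
#endif
#pragma vector aligned
#endif
  for (int jj = 0; jj < jnum; jj++) {
    fxtmp += ev[jj];            // assumes ev is vector-aligned, as the
    sevdwl += 0.5 * ev[jj];     // real kernels guarantee for their buffers
  }
  return fxtmp + sevdwl;
}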
@@ -309,9 +309,14 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
         }
 
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                           secoul, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
+#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                       secoul, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
 #pragma vector aligned
-#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
-                       sv0, sv1, sv2, sv3, sv4, sv5)
 #endif
         for (int jj = 0; jj < ej; jj++) {
           flt_t forcecoul, forcebuck, evdwl, ecoul;
@@ -230,12 +230,18 @@ void PairBuckIntel::eval(const int offload, const int vflag,
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EFLAG) fwtmp = sevdwl = (acc_t)0;
         if (NEWTON_PAIR == 0)
-          if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+          if (vflag == VIRIAL_PAIR)
+            sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                           sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
                        sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
+#pragma vector aligned
 #endif
         for (int jj = 0; jj < jnum; jj++) {
 
@@ -289,9 +289,14 @@ void PairDPDIntel::eval(const int offload, const int vflag,
         }
 
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                           sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
                        sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
+#pragma vector aligned
 #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcelj, evdwl;
@@ -327,8 +327,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
         }
 
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:rhoi)
+#else
 #pragma simd reduction(+:rhoi)
+#endif
+#pragma vector aligned
 #endif
         for (int jj = 0; jj < ej; jj++) {
           int jtype;

@@ -369,23 +373,35 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       const int rcount = nall;
       if (nthreads == 2) {
         double *trho2 = rho + nmax;
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
         for (int n = 0; n < rcount; n++)
           rho[n] += trho2[n];
       } else if (nthreads == 4) {
         double *trho2 = rho + nmax;
         double *trho3 = trho2 + nmax;
         double *trho4 = trho3 + nmax;
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
         for (int n = 0; n < rcount; n++)
           rho[n] += trho2[n] + trho3[n] + trho4[n];
       } else {
         double *trhon = rho + nmax;
         for (int t = 1; t < nthreads; t++) {
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
           for (int n = 0; n < rcount; n++)
             rho[n] += trhon[n];
           trhon += nmax;

@@ -414,8 +430,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       if (EFLAG) tevdwl = (acc_t)0.0;
 
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:tevdwl)
+#else
 #pragma simd reduction(+:tevdwl)
+#endif
+#pragma vector aligned
 #endif
       for (int ii = iifrom; ii < iito; ++ii) {
         const int i = ilist[ii];

@@ -510,9 +530,14 @@ void PairEAMIntel::eval(const int offload, const int vflag,
         }
 
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                           sv0, sv1, sv2, sv3, sv4, sv5)
+#else
 #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
                        sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
+#pragma vector aligned
 #endif
         for (int jj = 0; jj < ej; jj++) {
           int jtype;
@@ -449,9 +449,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
         __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
 #endif
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \
+                           t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
+#else
+#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \
+                       t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
+#endif
 #pragma vector aligned
-#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
-                       sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
 #endif
         for (int jj = 0; jj < packed_j; jj++) {
           flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;

@@ -806,8 +811,12 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
       acc_t *f_scalar2 = f_scalar + fst4;
       for (int t = 1; t < nthreads; t++) {
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma vector aligned
 #endif
         for (int n = iifrom * 8; n < sto; n++)
           f_scalar[n] += f_scalar2[n];
@@ -294,9 +294,14 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
         }
 
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                           secoul, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
+#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                       secoul, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
 #pragma vector aligned
-#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
-                       sv0, sv1, sv2, sv3, sv4, sv5)
 #endif
         for (int jj = 0; jj < ej; jj++) {
           flt_t forcecoul, forcelj, evdwl;
@@ -314,9 +314,14 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
         }
 
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                           secoul, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
+#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                       secoul, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
 #pragma vector aligned
-#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
-                       sv0, sv1, sv2, sv3, sv4, sv5)
 #endif
         for (int jj = 0; jj < ej; jj++) {
           flt_t forcecoul, forcelj, evdwl, ecoul;
@@ -305,9 +305,14 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
         }
 
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                           secoul, sv0, sv1, sv2, sv3, sv4, sv5)
+#else
+#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                       secoul, sv0, sv1, sv2, sv3, sv4, sv5)
+#endif
 #pragma vector aligned
-#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
-                       sv0, sv1, sv2, sv3, sv4, sv5)
 #endif
         for (int jj = 0; jj < ej; jj++) {
           flt_t forcecoul, forcelj, evdwl, ecoul;
@@ -241,9 +241,15 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
           if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                           sv0, sv1, sv2, sv3, sv4, sv5) \
+  aligned(jlist,x,ljc12oi,special_lj,f,lj34i:64)
+#else
 #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
                        sv0, sv1, sv2, sv3, sv4, sv5)
+#pragma vector aligned
+#endif
 #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcelj, evdwl;
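The lj/cut kernel above is the one place the alignment assertion itself is made portable: the OpenMP branch folds it into an aligned(...:64) clause on the omp simd directive instead of relying on the Intel-only #pragma vector aligned. A sketch of that difference under the same USE_OMP_SIMD assumption; the function and array names are placeholders.

// Sketch: the aligned() clause carries the 64-byte alignment promise in
// standard OpenMP, the #else branch uses the Intel-specific directive pair.
static void axpy(double *f, const double *x, int n) {
#if defined(USE_OMP_SIMD)
#pragma omp simd aligned(f,x:64)
#else
#pragma simd
#pragma vector aligned
#endif
  for (int i = 0; i < n; i++)
    f[i] += 2.0 * x[i];   // both pointers must really be 64-byte aligned
}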
@@ -371,8 +371,12 @@ void PairSWIntel::eval(const int offload, const int vflag,
         }
 
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
+#else
 #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
+#endif
+#pragma vector aligned
 #endif
       for (int jj = 0; jj < ejnum_pad; jj++) {
         acc_t fjxtmp, fjytmp, fjztmp, fjtmp;
@@ -770,8 +770,12 @@ void PPPMDispIntel::particle_map(double delx, double dely, double delz,
     IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));
 
 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:flag)
+#else
 #pragma simd reduction(+:flag)
+#endif
+#pragma vector aligned
 #endif
     for (int i = iifrom; i < iito; i++) {
 

@@ -876,7 +880,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
         dz = dz*half_rho_scale + half_rho_scale_plus;
         int idz = dz;
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
         for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
           rho[0][k] = rho_lookup[idx][k];

@@ -885,7 +893,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
         }
       } else {
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
         for (int k = nlower; k <= nupper; k++) {
           FFT_SCALAR r1,r2,r3;

@@ -917,8 +929,12 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
             int mzy = m*nix + mz;
             FFT_SCALAR x0 = y0*rho[1][m];
 #if defined(LMP_SIMD_COMPILER)
-#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
 #endif
             for (int l = 0; l < order; l++) {
               int mzyx = l + mzy;
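In the PPPM stencil loops above, the Intel-specific #pragma loop_count trip-count hint now follows the directive guard rather than preceding it; both pragmas still bind to the loop that follows. A sketch of the resulting ordering, with a placeholder function and a literal trip-count bound standing in for INTEL_P3M_ALIGNED_MAXORDER.

// Sketch: vectorization directive first, Intel tuning hint second, both
// applying to the short stencil loop below.
static void spread(double *rho, const double *w, int order) {
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
#pragma omp simd
#else
#pragma simd
#endif
#pragma loop_count min(2), max(15), avg(7)
#endif
  for (int l = 0; l < order; l++)
    rho[l] += w[l];
}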
@ -939,7 +955,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
|
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
for (int j = 1; j < nthr; j++) {
|
for (int j = 1; j < nthr; j++) {
|
||||||
@ -1025,7 +1045,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
dz = dz*half_rho_scale + half_rho_scale_plus;
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
||||||
int idz = dz;
|
int idz = dz;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho[0][k] = rho6_lookup[idx][k];
|
rho[0][k] = rho6_lookup[idx][k];
|
||||||
@ -1034,7 +1058,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = nlower_6; k <= nupper_6; k++) {
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
||||||
FFT_SCALAR r1,r2,r3;
|
FFT_SCALAR r1,r2,r3;
|
||||||
@ -1067,8 +1095,12 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
int mzy = m*nix + mz;
|
int mzy = m*nix + mz;
|
||||||
FFT_SCALAR x0 = y0*rho[1][m];
|
FFT_SCALAR x0 = y0*rho[1][m];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < order; l++) {
|
for (int l = 0; l < order; l++) {
|
||||||
int mzyx = l + mzy;
|
int mzyx = l + mzy;
|
||||||
@ -1089,7 +1121,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr);
|
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr);
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
for (int j = 1; j < nthr; j++) {
|
for (int j = 1; j < nthr; j++) {
|
||||||
@ -1173,7 +1209,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
dz = dz*half_rho_scale + half_rho_scale_plus;
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
||||||
int idz = dz;
|
int idz = dz;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho[0][k] = rho6_lookup[idx][k];
|
rho[0][k] = rho6_lookup[idx][k];
|
||||||
@ -1182,7 +1222,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = nlower_6; k <= nupper_6; k++) {
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
||||||
FFT_SCALAR r1,r2,r3;
|
FFT_SCALAR r1,r2,r3;
|
||||||
@ -1215,8 +1259,12 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
int my = m + nysum;
|
int my = m + nysum;
|
||||||
FFT_SCALAR x0 = y0*rho[1][m];
|
FFT_SCALAR x0 = y0*rho[1][m];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < order; l++) {
|
for (int l = 0; l < order; l++) {
|
||||||
int mx = l + nxsum;
|
int mx = l + nxsum;
|
||||||
@ -1307,7 +1355,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
dz = dz*half_rho_scale + half_rho_scale_plus;
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
||||||
int idz = dz;
|
int idz = dz;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho[0][k] = rho6_lookup[idx][k];
|
rho[0][k] = rho6_lookup[idx][k];
|
||||||
@ -1316,7 +1368,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = nlower_6; k <= nupper_6; k++) {
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
||||||
FFT_SCALAR r1,r2,r3;
|
FFT_SCALAR r1,r2,r3;
|
||||||
@ -1349,8 +1405,12 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
int mzy = m*nix + mz;
|
int mzy = m*nix + mz;
|
||||||
FFT_SCALAR x0 = y0*rho[1][m];
|
FFT_SCALAR x0 = y0*rho[1][m];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < order; l++) {
|
for (int l = 0; l < order; l++) {
|
||||||
int mzyx = l + mzy;
|
int mzyx = l + mzy;
|
||||||
@ -1373,7 +1433,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr);
|
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr);
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
for (int j = 1; j < nthr; j++) {
|
for (int j = 1; j < nthr; j++) {
|
||||||
@ -1454,7 +1518,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
dz = dz*half_rho_scale + half_rho_scale_plus;
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
||||||
int idz = dz;
|
int idz = dz;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho0[k] = rho_lookup[idx][k];
|
rho0[k] = rho_lookup[idx][k];
|
||||||
@ -1463,7 +1531,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = nlower; k <= nupper; k++) {
|
for (int k = nlower; k <= nupper; k++) {
|
||||||
FFT_SCALAR r1 = rho_coeff[order-1][k];
|
FFT_SCALAR r1 = rho_coeff[order-1][k];
|
||||||
@ -1498,8 +1570,12 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
int my = m+nysum;
|
int my = m+nysum;
|
||||||
FFT_SCALAR y0 = z0*rho1[m];
|
FFT_SCALAR y0 = z0*rho1[m];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < order; l++) {
|
for (int l = 0; l < order; l++) {
|
||||||
int mx = l+nxsum;
|
int mx = l+nxsum;
|
||||||
@ -1624,7 +1700,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
int idz = dz;
|
int idz = dz;
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho[0][k] = rho_lookup[idx][k];
|
rho[0][k] = rho_lookup[idx][k];
|
||||||
@ -1636,7 +1716,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = nlower; k <= nupper; k++) {
|
for (int k = nlower; k <= nupper; k++) {
|
||||||
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
|
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
|
||||||
@ -1680,8 +1764,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
||||||
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < order; l++) {
|
for (int l = 0; l < order; l++) {
|
||||||
int mx = l + nxsum;
|
int mx = l + nxsum;
|
||||||
@ -1702,7 +1790,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
particle_ekx[i] *= hx_inv;
|
particle_ekx[i] *= hx_inv;
|
||||||
@ -1802,7 +1894,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
dz = dz*half_rho_scale + half_rho_scale_plus;
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
||||||
int idz = dz;
|
int idz = dz;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho0[k] = rho6_lookup[idx][k];
|
rho0[k] = rho6_lookup[idx][k];
|
||||||
@ -1811,7 +1907,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = nlower_6; k <= nupper_6; k++) {
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
||||||
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
|
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
|
||||||
@ -1846,8 +1946,12 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
int my = m+nysum;
|
int my = m+nysum;
|
||||||
FFT_SCALAR y0 = z0*rho1[m];
|
FFT_SCALAR y0 = z0*rho1[m];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < order; l++) {
|
for (int l = 0; l < order; l++) {
|
||||||
int mx = l+nxsum;
|
int mx = l+nxsum;
|
||||||
@ -1967,7 +2071,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
int idz = dz;
|
int idz = dz;
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho[0][k] = rho6_lookup[idx][k];
|
rho[0][k] = rho6_lookup[idx][k];
|
||||||
@ -1979,7 +2087,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = nlower_6; k <= nupper_6; k++) {
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
||||||
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
|
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
|
||||||
@ -2023,8 +2135,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
||||||
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < order; l++) {
|
for (int l = 0; l < order; l++) {
|
||||||
int mx = l + nxsum;
|
int mx = l + nxsum;
|
||||||
@ -2045,7 +2161,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
particle_ekx[i] *= hx_inv;
|
particle_ekx[i] *= hx_inv;
|
||||||
@ -2143,7 +2263,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
dz = dz*half_rho_scale + half_rho_scale_plus;
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
||||||
int idz = dz;
|
int idz = dz;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho0[k] = rho6_lookup[idx][k];
|
rho0[k] = rho6_lookup[idx][k];
|
||||||
@ -2152,7 +2276,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = nlower_6; k <= nupper_6; k++) {
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
||||||
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
|
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
|
||||||
@ -2206,8 +2334,12 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
int my = m+nysum;
|
int my = m+nysum;
|
||||||
FFT_SCALAR y0 = z0*rho1[m];
|
FFT_SCALAR y0 = z0*rho1[m];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < order; l++) {
|
for (int l = 0; l < order; l++) {
|
||||||
int mx = l+nxsum;
|
int mx = l+nxsum;
|
||||||
@ -2398,7 +2530,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
int idz = dz;
|
int idz = dz;
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho[0][k] = rho6_lookup[idx][k];
|
rho[0][k] = rho6_lookup[idx][k];
|
||||||
@ -2410,7 +2546,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = nlower_6; k <= nupper_6; k++) {
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
||||||
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
|
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
|
||||||
@ -2479,8 +2619,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
||||||
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < order; l++) {
|
for (int l = 0; l < order; l++) {
|
||||||
int mx = l + nxsum;
|
int mx = l + nxsum;
|
||||||
@ -2541,7 +2685,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
particle_ekx0[i] *= hx_inv;
|
particle_ekx0[i] *= hx_inv;
|
||||||
@ -2671,7 +2819,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
dz = dz*half_rho_scale + half_rho_scale_plus;
|
dz = dz*half_rho_scale + half_rho_scale_plus;
|
||||||
int idz = dz;
|
int idz = dz;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho0[k] = rho6_lookup[idx][k];
|
rho0[k] = rho6_lookup[idx][k];
|
||||||
@ -2680,7 +2832,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = nlower_6; k <= nupper_6; k++) {
|
for (int k = nlower_6; k <= nupper_6; k++) {
|
||||||
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
|
FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
|
||||||
@ -2721,8 +2877,12 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
int my = m+nysum;
|
int my = m+nysum;
|
||||||
FFT_SCALAR y0 = z0*rho1[m];
|
FFT_SCALAR y0 = z0*rho1[m];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < order; l++) {
|
for (int l = 0; l < order; l++) {
|
||||||
int mx = l+nxsum;
|
int mx = l+nxsum;
|
||||||
@ -2848,7 +3008,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
|
|||||||
int idz = dz;
|
int idz = dz;
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#if defined(USE_OMP_SIMD)
|
||||||
|
#pragma omp simd
|
||||||
|
#else
|
||||||
#pragma simd
|
#pragma simd
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho[0][k] = rho6_lookup[idx][k];
|
rho[0][k] = rho6_lookup[idx][k];
|
||||||
@@ -2860,7 +3024,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
 }
 } else {
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k = nlower_6; k <= nupper_6; k++) {
 FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@@ -2909,8 +3077,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> * /*buffers*/)
 FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
 FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
 #if defined(LMP_SIMD_COMPILER)
-#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
+#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
 #endif
 for (int l = 0; l < order; l++) {
 int mx = l + nxsum;
@@ -2992,7 +3164,11 @@ void PPPMDispIntel::precompute_rho()
 for (int i = 0; i < rho_points; i++) {
 FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k=nlower; k<=nupper;k++) {
 FFT_SCALAR r1 = ZEROF;
@@ -3006,7 +3182,11 @@ void PPPMDispIntel::precompute_rho()
 }
 if (differentiation_flag == 1) {
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k=nlower; k<=nupper;k++) {
 FFT_SCALAR r1 = ZEROF;
@@ -3026,7 +3206,11 @@ void PPPMDispIntel::precompute_rho()
 for (int i = 0; i < rho_points; i++) {
 FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k=nlower_6; k<=nupper_6;k++) {
 FFT_SCALAR r1 = ZEROF;
@@ -3040,7 +3224,11 @@ void PPPMDispIntel::precompute_rho()
 }
 if (differentiation_flag == 1) {
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k=nlower_6; k<=nupper_6;k++) {
 FFT_SCALAR r1 = ZEROF;
@@ -394,8 +394,12 @@ void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers)
 IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));

 #if defined(LMP_SIMD_COMPILER)
-#pragma vector aligned
+#if defined(USE_OMP_SIMD)
+#pragma omp simd reduction(+:flag)
+#else
 #pragma simd reduction(+:flag)
+#endif
+#pragma vector aligned
 #endif
 for (int i = iifrom; i < iito; i++) {

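In particle_map the directive carries a reduction clause, and the Intel-specific alignment hint "#pragma vector aligned" is moved after the directive-selection block so both branches keep it; only the SIMD pragma itself switches to the "omp simd reduction(+:flag)" form. A hedged sketch of that arrangement (the bounds test below is a stand-in for the real mapping check, and the alignment hint assumes suitably aligned input, as in the buffered arrays of the real code):

    // stand-in for the out-of-range flag accumulation in particle_map
    int count_out_of_range(const double *x, int n, double lo, double hi) {
      int flag = 0;
    #if defined(LMP_SIMD_COMPILER)
    #if defined(USE_OMP_SIMD)
    #pragma omp simd reduction(+:flag)
    #else
    #pragma simd reduction(+:flag)
    #endif
    #pragma vector aligned
    #endif
      for (int i = 0; i < n; i++)
        if (x[i] < lo || x[i] > hi) flag++;
      return flag;
    }

OpenMP offers an aligned() clause on the simd construct that could express the same hint, but this patch keeps the alignment directive in its Intel form.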
@@ -500,7 +504,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
 dz = dz*half_rho_scale + half_rho_scale_plus;
 int idz = dz;
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
 rho[0][k] = rho_lookup[idx][k];
@@ -509,7 +517,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
 }
 } else {
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k = nlower; k <= nupper; k++) {
 FFT_SCALAR r1,r2,r3;
@@ -541,7 +553,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
 int mzy = m*nix + mz;
 FFT_SCALAR x0 = y0*rho[1][m];
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
 int mzyx = l + mzy;
@@ -563,7 +579,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
 IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);

 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int i = ifrom; i < ito; i++) {
 for (int j = 1; j < nthr; j++) {
@@ -645,7 +665,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
 dz = dz*half_rho_scale + half_rho_scale_plus;
 int idz = dz;
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
 rho0[k] = rho_lookup[idx][k];
@@ -654,7 +678,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
 }
 } else {
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k = nlower; k <= nupper; k++) {
 FFT_SCALAR r1 = rho_coeff[order-1][k];
@@ -690,7 +718,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
 int my = m+nysum;
 FFT_SCALAR y0 = z0*rho1[m];
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
 int mx = l+nxsum;
@@ -813,7 +845,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
 dz = dz*half_rho_scale + half_rho_scale_plus;
 int idz = dz;
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
 rho[0][k] = rho_lookup[idx][k];
@@ -825,7 +861,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
 }
 } else {
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k = nlower; k <= nupper; k++) {
 FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
@@ -871,7 +911,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
 FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
 FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
 int mx = l + nxsum;
@@ -893,7 +937,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
 }

 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int i = ifrom; i < ito; i++) {
 particle_ekx[i] *= hx_inv;
@@ -942,7 +990,11 @@ void PPPMIntel::precompute_rho()
 for (int i = 0; i < rho_points; i++) {
 FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k=nlower; k<=nupper;k++) {
 FFT_SCALAR r1 = ZEROF;
@@ -956,7 +1008,11 @@ void PPPMIntel::precompute_rho()
 }
 if (differentiation_flag == 1) {
 #if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#pragma omp simd
+#else
 #pragma simd
+#endif
 #endif
 for (int k=nlower; k<=nupper;k++) {
 FFT_SCALAR r1 = ZEROF;
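The patch only consumes USE_OMP_SIMD; where that macro gets defined is not part of this diff. Purely as a hypothetical probe of which path a given build selects (the macro would normally come from the Intel package headers or the compile line, e.g. -DUSE_OMP_SIMD with an OpenMP 4.0+ compiler):

    // probe.cpp -- hypothetical, NOT part of the patch
    #include <cstdio>

    int main() {
    #if defined(USE_OMP_SIMD)
      std::puts("SIMD loops: #pragma omp simd (OpenMP 4.0+)");
    #elif defined(LMP_SIMD_COMPILER)
      std::puts("SIMD loops: legacy #pragma simd");
    #else
      std::puts("SIMD loops: compiler auto-vectorization only");
    #endif
    }

Example: "g++ -fopenmp -DUSE_OMP_SIMD probe.cpp" would take the first branch, matching the OpenMP path added throughout this commit.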