From fefcd0e2af255056a0dc49a1604e02b9fa92a8c8 Mon Sep 17 00:00:00 2001 From: Mike Brown Date: Mon, 26 Jul 2021 11:22:21 -0700 Subject: [PATCH] Converting Cilk vectorization directives to the OpenMP standard and changing some more deprecated vector intrinsics. Data alignment directives for compiler vectorization are still mostly Intel-specific. --- src/INTEL/angle_charmm_intel.cpp | 8 + src/INTEL/angle_harmonic_intel.cpp | 8 + src/INTEL/bond_fene_intel.cpp | 8 + src/INTEL/bond_harmonic_intel.cpp | 8 + src/INTEL/dihedral_charmm_intel.cpp | 19 +- src/INTEL/dihedral_fourier_intel.cpp | 8 + src/INTEL/dihedral_harmonic_intel.cpp | 8 + src/INTEL/dihedral_opls_intel.cpp | 8 + src/INTEL/fix_intel.cpp | 32 ++- src/INTEL/fix_nh_intel.cpp | 66 ++++- src/INTEL/fix_nve_asphere_intel.cpp | 24 +- src/INTEL/fix_nve_intel.cpp | 36 ++- src/INTEL/improper_cvff_intel.cpp | 12 + src/INTEL/improper_harmonic_intel.cpp | 8 + src/INTEL/intel_intrinsics.h | 3 +- src/INTEL/intel_intrinsics_airebo.h | 36 +-- src/INTEL/intel_preprocess.h | 200 +++++++++++++++ src/INTEL/intel_simd.h | 239 +++++++++++------- src/INTEL/npair_full_bin_ghost_intel.cpp | 26 +- src/INTEL/npair_intel.cpp | 40 ++- src/INTEL/pair_buck_coul_cut_intel.cpp | 10 +- src/INTEL/pair_buck_coul_long_intel.cpp | 9 +- src/INTEL/pair_buck_intel.cpp | 10 +- src/INTEL/pair_dpd_intel.cpp | 9 +- src/INTEL/pair_eam_intel.cpp | 39 ++- src/INTEL/pair_gayberne_intel.cpp | 15 +- .../pair_lj_charmm_coul_charmm_intel.cpp | 9 +- src/INTEL/pair_lj_charmm_coul_long_intel.cpp | 9 +- src/INTEL/pair_lj_cut_coul_long_intel.cpp | 9 +- src/INTEL/pair_lj_cut_intel.cpp | 8 +- src/INTEL/pair_sw_intel.cpp | 6 +- src/INTEL/pppm_disp_intel.cpp | 214 +++++++++++++++- src/INTEL/pppm_intel.cpp | 58 ++++- 33 files changed, 1013 insertions(+), 189 deletions(-) diff --git a/src/INTEL/angle_charmm_intel.cpp b/src/INTEL/angle_charmm_intel.cpp index 29b7ec208b..26943934be 100644 --- a/src/INTEL/angle_charmm_intel.cpp +++ b/src/INTEL/angle_charmm_intel.cpp @@ -162,7 +162,11 @@ void AngleCharmmIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -246,7 +250,11 @@ void AngleCharmmIntel::eval(const int vflag, // apply force to each of 3 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/angle_harmonic_intel.cpp b/src/INTEL/angle_harmonic_intel.cpp index a2d8cc7d13..e392730edc 100644 --- a/src/INTEL/angle_harmonic_intel.cpp +++ b/src/INTEL/angle_harmonic_intel.cpp @@ -162,7 +162,11 @@ void AngleHarmonicIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -228,7 +232,11 @@ void AngleHarmonicIntel::eval(const int vflag, // apply force to each of 3 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/bond_fene_intel.cpp
b/src/INTEL/bond_fene_intel.cpp index 44a8c0d3cf..1ab8da68d9 100644 --- a/src/INTEL/bond_fene_intel.cpp +++ b/src/INTEL/bond_fene_intel.cpp @@ -158,7 +158,11 @@ void BondFENEIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -215,7 +219,11 @@ void BondFENEIntel::eval(const int vflag, // apply force to each of 2 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/bond_harmonic_intel.cpp b/src/INTEL/bond_harmonic_intel.cpp index a37ae091a0..35b194f0fa 100644 --- a/src/INTEL/bond_harmonic_intel.cpp +++ b/src/INTEL/bond_harmonic_intel.cpp @@ -155,7 +155,11 @@ void BondHarmonicIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -184,7 +188,11 @@ void BondHarmonicIntel::eval(const int vflag, // apply force to each of 2 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/dihedral_charmm_intel.cpp b/src/INTEL/dihedral_charmm_intel.cpp index a317be00fb..4116d9134f 100644 --- a/src/INTEL/dihedral_charmm_intel.cpp +++ b/src/INTEL/dihedral_charmm_intel.cpp @@ -181,9 +181,16 @@ void DihedralCharmmIntel::eval(const int vflag, } #if defined(LMP_SIMD_COMPILER_TEST) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \ + sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \ + spv5) +#else #pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \ - sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5) + sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, \ + spv5) +#endif + #pragma vector aligned for (int n = nfrom; n < nto; n++) { #endif for (int n = nfrom; n < nto; n += npl) { @@ -329,7 +336,11 @@ void DihedralCharmmIntel::eval(const int vflag, #if defined(LMP_SIMD_COMPILER_TEST) +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i2 < nlocal) { @@ -408,7 +419,11 @@ void DihedralCharmmIntel::eval(const int vflag, // apply force to each of 4 atoms #if defined(LMP_SIMD_COMPILER_TEST) +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/dihedral_fourier_intel.cpp b/src/INTEL/dihedral_fourier_intel.cpp index 4d44ea36d2..d952ac7506 100644 --- a/src/INTEL/dihedral_fourier_intel.cpp +++ b/src/INTEL/dihedral_fourier_intel.cpp @@ -154,7 +154,11 @@ void DihedralFourierIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n 
+= npl) { @@ -304,7 +308,11 @@ void DihedralFourierIntel::eval(const int vflag, } #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/dihedral_harmonic_intel.cpp b/src/INTEL/dihedral_harmonic_intel.cpp index f7009689c7..df9304b6ba 100644 --- a/src/INTEL/dihedral_harmonic_intel.cpp +++ b/src/INTEL/dihedral_harmonic_intel.cpp @@ -154,7 +154,11 @@ void DihedralHarmonicIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -299,7 +303,11 @@ void DihedralHarmonicIntel::eval(const int vflag, } #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/dihedral_opls_intel.cpp b/src/INTEL/dihedral_opls_intel.cpp index ab007dad8c..89f06773d5 100644 --- a/src/INTEL/dihedral_opls_intel.cpp +++ b/src/INTEL/dihedral_opls_intel.cpp @@ -158,7 +158,11 @@ void DihedralOPLSIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n ++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -319,7 +323,11 @@ void DihedralOPLSIntel::eval(const int vflag, } #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/fix_intel.cpp b/src/INTEL/fix_intel.cpp index 59eea4961a..d0633d7791 100644 --- a/src/INTEL/fix_intel.cpp +++ b/src/INTEL/fix_intel.cpp @@ -635,19 +635,31 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar) if (_nthreads == 4) { acc_t *f_scalar3 = f_scalar2 + f_stride4; acc_t *f_scalar4 = f_scalar3 + f_stride4; - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") + #if defined(USE_OMP_SIMD) + #pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4:64) + #elif defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int n = 0; n < o_range; n++) f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; } else if (_nthreads == 2) { - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") + #if defined(USE_OMP_SIMD) + #pragma omp simd aligned(f_scalar,f_scalar2:64) + #elif defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int n = 0; n < o_range; n++) f_scalar[n] += f_scalar2[n]; } else { acc_t *f_scalar3 = f_scalar2 + f_stride4; - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") + #if defined(USE_OMP_SIMD) + #pragma omp simd aligned(f_scalar,f_scalar2,f_scalar3:64) + #elif defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int n = 0; n < o_range; n++) f_scalar[n] += f_scalar2[n] + f_scalar3[n]; } @@ -662,8 +674,12 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar) acc_t *f_scalar2 = f_scalar + f_stride4; for (int t = 1; t < _nthreads; t++) { - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") + #if defined(USE_OMP_SIMD) + 
#pragma omp simd aligned(f_scalar,f_scalar2:64) + #elif defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif for (int n = iifrom; n < iito; n++) f_scalar[n] += f_scalar2[n]; f_scalar2 += f_stride4; diff --git a/src/INTEL/fix_nh_intel.cpp b/src/INTEL/fix_nh_intel.cpp index 5370e3a13f..a4fdecbd96 100644 --- a/src/INTEL/fix_nh_intel.cpp +++ b/src/INTEL/fix_nh_intel.cpp @@ -99,8 +99,12 @@ void FixNHIntel::remap() if (allremap) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { const double d0 = x[i].x - b0; @@ -112,8 +116,12 @@ void FixNHIntel::remap() } } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & dilate_group_bit) { @@ -278,8 +286,12 @@ void FixNHIntel::remap() if (allremap) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; @@ -288,8 +300,12 @@ void FixNHIntel::remap() } } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & dilate_group_bit) { @@ -415,8 +431,12 @@ void FixNHIntel::nh_v_press() if (igroup == 0) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { v[i].x *= f0; @@ -425,8 +445,12 @@ void FixNHIntel::nh_v_press() } } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { @@ -448,8 +472,12 @@ void FixNHIntel::nve_v() double * _noalias const v = atom->v[0]; const double * _noalias const f = atom->f[0]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) v[i] += _dtfm[i] * f[i]; @@ -468,15 +496,23 @@ void FixNHIntel::nve_x() if (igroup == 0) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) x[i] += dtv * v[i]; } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) @@ -500,15 +536,23 @@ void FixNHIntel::nh_v_temp() if (igroup == 0) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) v[i] *= factor_eta; } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) diff --git 
a/src/INTEL/fix_nve_asphere_intel.cpp b/src/INTEL/fix_nve_asphere_intel.cpp index 78504c237a..eda8b48a67 100644 --- a/src/INTEL/fix_nve_asphere_intel.cpp +++ b/src/INTEL/fix_nve_asphere_intel.cpp @@ -97,8 +97,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/) dtq = 0.5 * dtv; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += _dtfm[i] * f[i]; @@ -108,8 +112,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/) // update angular momentum by 1/2 step if (igroup == 0) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { double *quat = bonus[ellipsoid[i]].quat; @@ -118,8 +126,12 @@ void FixNVEAsphereIntel::initial_integrate(int /*vflag*/) } } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { @@ -143,8 +155,12 @@ void FixNVEAsphereIntel::final_integrate() const double * _noalias const torque = atom->torque[0]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += _dtfm[i] * f[i]; diff --git a/src/INTEL/fix_nve_intel.cpp b/src/INTEL/fix_nve_intel.cpp index fb90946da0..9670af65c2 100644 --- a/src/INTEL/fix_nve_intel.cpp +++ b/src/INTEL/fix_nve_intel.cpp @@ -68,8 +68,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/) if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) { const double dtfm = dtf / atom->mass[1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += dtfm * f[i]; @@ -78,8 +82,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/) } else if (igroup == 0) { if (neighbor->ago == 0) reset_dt(); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += _dtfm[i] * f[i]; @@ -88,8 +96,12 @@ void FixNVEIntel::initial_integrate(int /*vflag*/) } else { if (neighbor->ago == 0) reset_dt(); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) { @@ -112,16 +124,24 @@ void FixNVEIntel::final_integrate() _nlocal3 = 3 * atom->nlocal; const double dtfm = dtf / atom->mass[1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) v[i] += dtfm * f[i]; } else if (igroup == 0) { if (neighbor->ago == 0) reset_dt(); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) { v[i] += _dtfm[i] * f[i]; @@ -129,8 +149,12 @@ void FixNVEIntel::final_integrate() } else { if (neighbor->ago == 0) reset_dt(); #if 
defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int i = 0; i < _nlocal3; i++) v[i] += _dtfm[i] * f[i]; diff --git a/src/INTEL/improper_cvff_intel.cpp b/src/INTEL/improper_cvff_intel.cpp index 62dcde36b9..4d473de7aa 100644 --- a/src/INTEL/improper_cvff_intel.cpp +++ b/src/INTEL/improper_cvff_intel.cpp @@ -165,7 +165,11 @@ void ImproperCvffIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -247,7 +251,11 @@ void ImproperCvffIntel::eval(const int vflag, flt_t p, pd; #ifdef LMP_INTEL_USE_SIMDOFF_FIX +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (m == 2) { @@ -319,7 +327,11 @@ void ImproperCvffIntel::eval(const int vflag, // apply force to each of 4 atoms #ifdef LMP_INTEL_USE_SIMDOFF_FIX +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/improper_harmonic_intel.cpp b/src/INTEL/improper_harmonic_intel.cpp index b3d4c342d9..1a637fa1a6 100644 --- a/src/INTEL/improper_harmonic_intel.cpp +++ b/src/INTEL/improper_harmonic_intel.cpp @@ -167,7 +167,11 @@ void ImproperHarmonicIntel::eval(const int vflag, if (VFLAG && vflag) { sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) +#endif for (int n = nfrom; n < nto; n++) { #else for (int n = nfrom; n < nto; n += npl) { @@ -276,7 +280,11 @@ void ImproperHarmonicIntel::eval(const int vflag, // apply force to each of 4 atoms #ifdef LMP_INTEL_USE_SIMDOFF +#if defined(USE_OMP_SIMD) + #pragma omp ordered simd +#else #pragma simdoff +#endif #endif { if (NEWTON_BOND || i1 < nlocal) { diff --git a/src/INTEL/intel_intrinsics.h b/src/INTEL/intel_intrinsics.h index 295310283d..567f04c5dc 100644 --- a/src/INTEL/intel_intrinsics.h +++ b/src/INTEL/intel_intrinsics.h @@ -127,7 +127,8 @@ struct vector_ops { } template static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) { - return _mm512_mask_i32logather_pd(from, mask, idx, base, scale); + return _mm512_mask_i32gather_pd(from, mask, _mm512_castsi512_si256(idx), + base, scale); } static fvec blend(const bvec &mask, const fvec &a, const fvec &b) { return _mm512_mask_blend_pd(mask, a, b); diff --git a/src/INTEL/intel_intrinsics_airebo.h b/src/INTEL/intel_intrinsics_airebo.h index ac58ca2438..ea29888ea1 100644 --- a/src/INTEL/intel_intrinsics_airebo.h +++ b/src/INTEL/intel_intrinsics_airebo.h @@ -511,7 +511,8 @@ public: const int scale) { assert(scale == sizeof(FVEC_SCAL_T)); # if FVEC_LEN==8 - return FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem, sizeof(FVEC_SCAL_T)); + return FVEC_SUFFIX(_mm512_i32gather_)(_mm512_castsi512_si256(idx.val_), + mem, sizeof(FVEC_SCAL_T)); # else return FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem, sizeof(FVEC_SCAL_T)); # endif @@ -522,8 +523,8 @@ public: ) { assert(scale == sizeof(FVEC_SCAL_T)); # if FVEC_LEN==8 - return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_, - mem, sizeof(FVEC_SCAL_T)); + 
return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, + _mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T)); # else return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_, mem, sizeof(FVEC_SCAL_T)); @@ -609,8 +610,8 @@ public: ) { assert(scale == sizeof(FVEC_SCAL_T)); # if FVEC_LEN==8 - return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_, - mem, sizeof(FVEC_SCAL_T)); + return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, + _mm512_castsi512_si256(idx.val_), mem, sizeof(FVEC_SCAL_T)); # else return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_, mem, sizeof(FVEC_SCAL_T)); @@ -622,8 +623,9 @@ public: ) { assert(scale == sizeof(FVEC_SCAL_T)); # if FVEC_LEN==8 - FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, a.val_, - sizeof(FVEC_SCAL_T)); + FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, + _mm512_castsi512_si256(idx.val_), + a.val_, sizeof(FVEC_SCAL_T)); # else FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, a.val_, sizeof(FVEC_SCAL_T)); @@ -666,11 +668,11 @@ public: const double * mem, const int scale ) { assert(scale == sizeof(double)); - __m512d lo = _mm512_mask_i32logather_pd(src.lo_, mask.val_, idx.val_, mem, - sizeof(double)); - __m512d hi = _mm512_mask_i32logather_pd(src.hi_, get_bvec_hi(mask.val_), - get_ivec_hi(idx.val_), mem, - sizeof(double)); + __m512d lo = _mm512_mask_i32gather_pd(src.lo_, mask.val_, + _mm512_castsi512_si256(idx.val_), + mem, sizeof(double)); + __m512d hi = _mm512_mask_i32gather_pd(src.hi_, get_bvec_hi(mask.val_), + _mm512_castsi512_si256(get_ivec_hi(idx.val_)), mem, sizeof(double)); return avec16pd(lo, hi); } VEC_INLINE static void mask_i32loscatter( @@ -678,10 +680,12 @@ public: const avec16pd &a, const int scale ) { assert(scale == sizeof(double)); - _mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_, - sizeof(double)); - _mm512_mask_i32loscatter_pd(mem, get_bvec_hi(mask.val_), - get_ivec_hi(idx.val_), a.hi_, sizeof(double)); + _mm512_mask_i32scatter_pd(mem, mask.val_, + _mm512_castsi512_si256(idx.val_), a.lo_, + sizeof(double)); + _mm512_mask_i32scatter_pd(mem, get_bvec_hi(mask.val_), + _mm512_castsi512_si256(get_ivec_hi(idx.val_)), + a.hi_, sizeof(double)); } #define AVEC2_BINOP(the_sym, the_name) \ diff --git a/src/INTEL/intel_preprocess.h b/src/INTEL/intel_preprocess.h index 0bec9935db..41c91d1578 100644 --- a/src/INTEL/intel_preprocess.h +++ b/src/INTEL/intel_preprocess.h @@ -17,8 +17,13 @@ ------------------------------------------------------------------------- */ #ifdef __INTEL_LLVM_COMPILER +#define USE_OMP_SIMD #define __INTEL_COMPILER __INTEL_LLVM_COMPILER #define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER +#define _MM_SCALE_1 1 +#define _MM_SCALE_2 2 +#define _MM_SCALE_4 4 +#define _MM_SCALE_8 8 #endif #ifdef __INTEL_COMPILER @@ -332,6 +337,9 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #endif +// TO BE DEPRECATED +#ifndef USE_OMP_SIMD + #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ f_stride, pos, ov0, ov1, ov2, \ ov3, ov4, ov5) \ @@ -526,6 +534,198 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, } \ } +#else + +#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ + f_stride, pos, ov0, ov1, ov2, \ + ov3, ov4, ov5) \ +{ \ + acc_t *f_scalar = &f_start[0].x; \ + flt_t *x_scalar = &pos[minlocal].x; \ + int f_stride4 = f_stride * 4; \ + _alignvar(acc_t ovv[16],64); \ + int vwidth; \ + if (sizeof(acc_t) 
== sizeof(double)) \ + vwidth = INTEL_COMPILE_WIDTH/2; \ + else \ + vwidth = INTEL_COMPILE_WIDTH; \ + if (vwidth < 4) vwidth = 4; \ + _use_simd_pragma("omp simd aligned(ovv:64)") \ + for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \ + int remainder = lt % vwidth; \ + if (lf > lt) remainder = 0; \ + const int v_range = lt - remainder; \ + if (nthreads == 2) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,ovv,x_scalar:64)")\ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n]; \ + } else if (nthreads==4) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + acc_t *f_scalar3 = f_scalar2 + f_stride4; \ + acc_t *f_scalar4 = f_scalar3 + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4,ovv:64)") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \ + f_scalar4[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \ + } else if (nthreads==1) { \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("omp simd aligned(ovv,f_scalar,x_scalar:64)") \ + for (int v = 0; v < vwidth; v++) \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + } else if 
(nthreads==3) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + acc_t *f_scalar3 = f_scalar2 + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,ovv,x_scalar:64)") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \ + } \ + for (int n = v_range; n < lt; n += 4) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + for (int v = 0; v < 4; v++) \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + } \ + ov0 += ovv[0]; \ + ov1 += ovv[1]; \ + ov2 += ovv[2]; \ + if (vwidth > 4) { \ + ov0 += ovv[4]; \ + ov1 += ovv[5]; \ + ov2 += ovv[6]; \ + } \ + if (vwidth > 8) { \ + ov0 += ovv[8] + ovv[12]; \ + ov1 += ovv[9] + ovv[13]; \ + ov2 += ovv[10] + ovv[14]; \ + } \ +} + +#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \ + f_stride, pos, offload, vflag, ov0, ov1, \ + ov2, ov3, ov4, ov5) \ +{ \ + int o_range = (nall - minlocal) * 4; \ + IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \ + sizeof(acc_t)); \ + \ + acc_t *f_scalar = &f_start[0].x; \ + int f_stride4 = f_stride * 4; \ + int t; \ + if (vflag == VIRIAL_FDOTR) t = 4; else t = 1; \ + acc_t *f_scalar2 = f_scalar + f_stride4 * t; \ + for ( ; t < nthreads; t++) { \ + _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2:64)") \ + for (int n = iifrom; n < iito; n++) \ + f_scalar[n] += f_scalar2[n]; \ + f_scalar2 += f_stride4; \ + } \ + \ + if (vflag == VIRIAL_FDOTR) { \ + int nt_min = MIN(4,nthreads); \ + IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \ + f_stride, pos, ov0, ov1, ov2, ov3, ov4, \ + ov5); \ + } \ +} + +#endif + #ifdef _LMP_INTEL_OFFLOAD #include diff --git a/src/INTEL/intel_simd.h b/src/INTEL/intel_simd.h index 165455a33d..eb5d9857a5 100644 --- a/src/INTEL/intel_simd.h +++ b/src/INTEL/intel_simd.h @@ -173,7 +173,7 @@ namespace ip_simd { } inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) { - return _mm512_i32logather_pd(i, p, _MM_SCALE_8); + return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8); } inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p, @@ -190,8 +190,8 @@ namespace ip_simd { inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p, const SIMD_int &i) { - return _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, p, - _MM_SCALE_8); + return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), p, _MM_SCALE_8); } template @@ -227,8 +227,8 @@ namespace ip_simd { inline SIMD_double SIMD_gatherz(const 
SIMD_mask &m, const double *p, const SIMD_int &i) { - return _mm512_mask_i32logather_pd( _mm512_set1_pd(0.0), m, i, p, - _MM_SCALE_8); + return _mm512_mask_i32gather_pd( _mm512_set1_pd(0.0), m, + _mm512_castsi512_si256(i),p, _MM_SCALE_8); } // ------- Store Operations @@ -257,7 +257,8 @@ namespace ip_simd { inline void SIMD_scatter(const SIMD_mask &m, double *p, const SIMD_int &i, const SIMD_double &vec) { - _mm512_mask_i32loscatter_pd(p, m, i, vec, _MM_SCALE_8); + _mm512_mask_i32scatter_pd(p, m, _mm512_castsi512_si256(i), vec, + _MM_SCALE_8); } // ------- Arithmetic Operations @@ -834,23 +835,29 @@ namespace ip_simd { inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom, const SIMD_int &i, SIMD_double &x, SIMD_double &y, SIMD_double &z) { - x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, - _MM_SCALE_2); - y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1, - _MM_SCALE_2); - z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2, - _MM_SCALE_2); + x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom, + _MM_SCALE_2); + y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom+1, + _MM_SCALE_2); + z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom+2, + _MM_SCALE_2); } inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom, const SIMD_int &i, SIMD_double &x, SIMD_double &y, SIMD_double &z, SIMD_int &type) { - x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, - _MM_SCALE_2); - y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1, - _MM_SCALE_2); - z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2, - _MM_SCALE_2); + x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom, + _MM_SCALE_2); + y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom+1, + _MM_SCALE_2); + z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), atom+2, + _MM_SCALE_2); type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3, _MM_SCALE_2); } @@ -888,10 +895,12 @@ namespace ip_simd { const SIMD_int &joffset, SIMD_double &eng) { SIMD_double jeng; SIMD_conflict_pi_reduce1(rmask, joffset, eng); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force, _MM_SCALE_2); jeng = jeng + eng; - _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset), + jeng, _MM_SCALE_2); } inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, @@ -899,20 +908,24 @@ namespace ip_simd { SIMD_double engd, jeng; engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng)); SIMD_conflict_pi_reduce1(rmask, joffset, engd); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force, _MM_SCALE_2); jeng = jeng + engd; - _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset), + jeng, _MM_SCALE_2); SIMD_mask rmask2 = rmask >> 8; engd = _mm512_cvtps_pd(_mm512_castps512_ps256( _mm512_shuffle_f32x4(eng,eng,238))); SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, 
joffset, 238); SIMD_conflict_pi_reduce1(rmask2, joffset2, engd); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2, + _mm512_castsi512_si256(joffset2), + force, _MM_SCALE_2); jeng = jeng + engd; - _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2), + jeng, _MM_SCALE_2); } inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force, @@ -926,10 +939,12 @@ namespace ip_simd { SIMD_double jeng; SIMD_conflict_pi_reduce1(rmask, joffset, eng); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force, _MM_SCALE_2); jeng = jeng + eng; - _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset), + jeng, _MM_SCALE_2); } inline void SIMD_safe_jforce(const SIMD_mask &m, float *force, @@ -956,18 +971,24 @@ namespace ip_simd { SIMD_double &fy, SIMD_double &fz) { SIMD_conflict_pi_reduce3(m, i, fx, fy, fz); SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force, + _MM_SCALE_2); jfrc = jfrc + fx; - _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 1, + _MM_SCALE_2); jfrc = jfrc + fy; - _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 2, + _MM_SCALE_2); jfrc = jfrc + fz; - _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); } inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force, @@ -979,40 +1000,54 @@ namespace ip_simd { amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz)); SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd); SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force, _MM_SCALE_2); jfrc = jfrc + amxd; - _mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force + 1, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset), + jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force + 1, _MM_SCALE_2); jfrc = jfrc + amyd; - _mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force + 2, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, rmask, 
_mm512_castsi512_si256(joffset), + jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask, + _mm512_castsi512_si256(joffset), + force + 2, _MM_SCALE_2); jfrc = jfrc + amzd; - _mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, rmask, _mm512_castsi512_si256(joffset), + jfrc, _MM_SCALE_2); SIMD_mask rmask2 = rmask >> 8; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amx,amx,238))); + _mm512_shuffle_f32x4(amx,amx,238))); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amy,amy,238))); + _mm512_shuffle_f32x4(amy,amy,238))); amzd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amz,amz,238))); + _mm512_shuffle_f32x4(amz,amz,238))); SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2, + _mm512_castsi512_si256(joffset2), + force, _MM_SCALE_2); jfrc = jfrc + amxd; - _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force + 1, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2), + jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2, + _mm512_castsi512_si256(joffset2), + force + 1, _MM_SCALE_2); jfrc = jfrc + amyd; - _mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force + 2, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, rmask2, + _mm512_castsi512_si256(joffset2), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2, + _mm512_castsi512_si256(joffset2), + force + 2, _MM_SCALE_2); jfrc = jfrc + amzd; - _mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, rmask2, + _mm512_castsi512_si256(joffset2), jfrc, + _MM_SCALE_2); } inline void SIMD_jforce_update(const SIMD_mask &m, float *force, @@ -1064,18 +1099,24 @@ namespace ip_simd { const SIMD_int &i, const SIMD_double &fx, const SIMD_double &fy, const SIMD_double &fz) { SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force, + _MM_SCALE_2); jfrc = jfrc - fx; - _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 1, + _MM_SCALE_2); jfrc = jfrc - fy; - _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 2, + _MM_SCALE_2); jfrc = jfrc - fz; - _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); } inline void 
SIMD_jforce_update(const SIMD_mask &rmask, @@ -1502,11 +1543,12 @@ namespace ip_simd { fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); SIMD_conflict_pi_reduce1(hmask, k, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask, k, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask, + _mm512_castsi512_si256(k), + force + 3, _MM_SCALE_2); keng = keng + hevdwl; - _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k), + keng, _MM_SCALE_2); } } @@ -1523,11 +1565,12 @@ namespace ip_simd { fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); SIMD_conflict_pi_reduce1(hmask, k, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask, k, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask, + _mm512_castsi512_si256(k), + force + 3, _MM_SCALE_2); keng = keng + hevdwl; - _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k), + keng, _MM_SCALE_2); } SIMD_mask hmask2 = hmask >> 8; facradd = _mm512_cvtps_pd(_mm512_castps512_ps256( @@ -1539,11 +1582,13 @@ namespace ip_simd { fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl); SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238); SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask2, k2, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), + hmask2, + _mm512_castsi512_si256(k2), + force + 3, _MM_SCALE_2); keng = keng + hevdwl; - _mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force + 3, hmask2, _mm512_castsi512_si256(k2), + keng, _MM_SCALE_2); } } @@ -1815,24 +1860,32 @@ namespace ip_simd { const int EFLAG, const int eatom, const SIMD_double &fwtmp) { SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force, + _MM_SCALE_2); jfrc = jfrc + fx; - _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 1, + _MM_SCALE_2); jfrc = jfrc + fy; - _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), force + 2, + _MM_SCALE_2); jfrc = jfrc + fz; - _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); if (EFLAG) { if (eatom) { - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, - force + 3, _MM_SCALE_2); + jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + _mm512_castsi512_si256(i), + force + 3, _MM_SCALE_2); jfrc = jfrc + fwtmp; - _mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, 
_MM_SCALE_2); + _mm512_mask_i32scatter_pd(force+3, m, _mm512_castsi512_si256(i), jfrc, + _MM_SCALE_2); } } } diff --git a/src/INTEL/npair_full_bin_ghost_intel.cpp b/src/INTEL/npair_full_bin_ghost_intel.cpp index 082f95721f..e96f2c713d 100644 --- a/src/INTEL/npair_full_bin_ghost_intel.cpp +++ b/src/INTEL/npair_full_bin_ghost_intel.cpp @@ -324,7 +324,11 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int jj = bstart; jj < bend; jj++) tj[ncount++] = binpacked[jj]; @@ -345,15 +349,23 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + stencil[k]]; const int bend = binhead[ibin + stencil[k] + 1]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int jj = bstart; jj < bend; jj++) tj[ncount++] = binpacked[jj]; } } // if i < nlocal #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int u = 0; u < ncount; u++) { const int j = tj[u]; @@ -425,12 +437,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, int alln = n; n = 0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #ifdef LMP_INTEL_NBOR_COMPAT #pragma ivdep #else +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif + #pragma vector aligned #endif for (int u = 0; u < alln; u++) { int which; @@ -454,12 +470,16 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, alln = n2; n2 = maxnbors * 2; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #ifdef LMP_INTEL_NBOR_COMPAT #pragma ivdep #else +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif + #pragma vector aligned #endif for (int u = n2; u < alln; u++) { int which; diff --git a/src/INTEL/npair_intel.cpp b/src/INTEL/npair_intel.cpp index 643ceff8f3..395e50006c 100644 --- a/src/INTEL/npair_intel.cpp +++ b/src/INTEL/npair_intel.cpp @@ -344,14 +344,22 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int jj = bstart; jj < bend; jj++) tj[ncount++] = binpacked[jj]; } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int u = 0; u < ncount; u++) { const int j = tj[u]; @@ -375,7 +383,11 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin]; const int bend = binhead[ibin + 1]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int jj = bstart; jj < bend; jj++) { const int j = binpacked[jj]; @@ -533,12 +545,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, n = pack_offset; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #ifdef LMP_INTEL_NBOR_COMPAT #pragma ivdep #else +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif + #pragma vector aligned #endif for (int u = n; u < alln; u++) { int which; @@ -566,12 +582,16 @@ void 
NPairIntel::bin_newton(const int offload, NeighList *list, n2 = pack_offset + maxnbors; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #ifdef LMP_INTEL_NBOR_COMPAT #pragma ivdep #else +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif + #pragma vector aligned #endif for (int u = n2; u < alln; u++) { int which; @@ -737,8 +757,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, int jnum = numneigh[i]; if (!THREE) IP_PRE_neighbor_pad(jnum, offload); #if __INTEL_COMPILER+0 > 1499 +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(max:vlmax,vgmax) \ + reduction(min:vlmin, vgmin) +#else + #pragma simd reduction(max:vlmax,vgmax) \ + reduction(min:vlmin, vgmin) +#endif #pragma vector aligned - #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) #endif for (int jj = 0; jj < jnum; jj++) { const int j = jlist[jj] & NEIGHMASK; @@ -782,8 +808,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, int jnum = numneigh[i]; if (!THREE) IP_PRE_neighbor_pad(jnum, offload); int jj = 0; - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned for (jj = 0; jj < jnum; jj++) { const int which = jlist[jj] >> SBBITS & 3; const int j = jlist[jj] & NEIGHMASK; diff --git a/src/INTEL/pair_buck_coul_cut_intel.cpp b/src/INTEL/pair_buck_coul_cut_intel.cpp index 99905bfaa0..c67450fbc1 100644 --- a/src/INTEL/pair_buck_coul_cut_intel.cpp +++ b/src/INTEL/pair_buck_coul_cut_intel.cpp @@ -248,12 +248,18 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, fxtmp = fytmp = fztmp = (acc_t)0; if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; if (NEWTON_PAIR == 0) - if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + if (vflag == VIRIAL_PAIR) + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcecoul, forcebuck, evdwl, ecoul; diff --git a/src/INTEL/pair_buck_coul_long_intel.cpp b/src/INTEL/pair_buck_coul_long_intel.cpp index 1566ec23b6..7c795d5914 100644 --- a/src/INTEL/pair_buck_coul_long_intel.cpp +++ b/src/INTEL/pair_buck_coul_long_intel.cpp @@ -309,9 +309,14 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#else + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcebuck, evdwl, ecoul; diff --git a/src/INTEL/pair_buck_intel.cpp b/src/INTEL/pair_buck_intel.cpp index 26ef13be9a..ddab17765b 100644 --- a/src/INTEL/pair_buck_intel.cpp +++ b/src/INTEL/pair_buck_intel.cpp @@ -230,12 +230,18 @@ void PairBuckIntel::eval(const int offload, const int vflag, fxtmp = fytmp = fztmp = (acc_t)0; if (EFLAG) fwtmp = sevdwl = (acc_t)0; if (NEWTON_PAIR == 0) - if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + if (vflag == VIRIAL_PAIR) + sv0 
= sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < jnum; jj++) { diff --git a/src/INTEL/pair_dpd_intel.cpp b/src/INTEL/pair_dpd_intel.cpp index e7514a1f95..a9eb4fe6a4 100644 --- a/src/INTEL/pair_dpd_intel.cpp +++ b/src/INTEL/pair_dpd_intel.cpp @@ -289,9 +289,14 @@ void PairDPDIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + sv0, sv1, sv2, sv3, sv4, sv5) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcelj, evdwl; diff --git a/src/INTEL/pair_eam_intel.cpp b/src/INTEL/pair_eam_intel.cpp index dcff8957fd..13dbd60cb3 100644 --- a/src/INTEL/pair_eam_intel.cpp +++ b/src/INTEL/pair_eam_intel.cpp @@ -327,8 +327,12 @@ void PairEAMIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:rhoi) +#else #pragma simd reduction(+:rhoi) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < ej; jj++) { int jtype; @@ -369,23 +373,35 @@ void PairEAMIntel::eval(const int offload, const int vflag, const int rcount = nall; if (nthreads == 2) { double *trho2 = rho + nmax; - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned for (int n = 0; n < rcount; n++) rho[n] += trho2[n]; } else if (nthreads == 4) { double *trho2 = rho + nmax; double *trho3 = trho2 + nmax; double *trho4 = trho3 + nmax; - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned for (int n = 0; n < rcount; n++) rho[n] += trho2[n] + trho3[n] + trho4[n]; } else { double *trhon = rho + nmax; for (int t = 1; t < nthreads; t++) { - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned for (int n = 0; n < rcount; n++) rho[n] += trhon[n]; trhon += nmax; @@ -414,8 +430,12 @@ void PairEAMIntel::eval(const int offload, const int vflag, if (EFLAG) tevdwl = (acc_t)0.0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:tevdwl) +#else #pragma simd reduction(+:tevdwl) +#endif + #pragma vector aligned #endif for (int ii = iifrom; ii < iito; ++ii) { const int i = ilist[ii]; @@ -510,9 +530,14 @@ void PairEAMIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + sv0, sv1, sv2, sv3, sv4, sv5) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < ej; jj++) { int jtype; diff --git a/src/INTEL/pair_gayberne_intel.cpp b/src/INTEL/pair_gayberne_intel.cpp index d7becc7585..c3abf68c12 100644 --- a/src/INTEL/pair_gayberne_intel.cpp +++ 
b/src/INTEL/pair_gayberne_intel.cpp @@ -449,9 +449,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0); #endif #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \ + t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) +#else + #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp, \ + t3tmp,sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \ - sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) #endif for (int jj = 0; jj < packed_j; jj++) { flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8; @@ -806,8 +811,12 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, acc_t *f_scalar2 = f_scalar + fst4; for (int t = 1; t < nthreads; t++) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma vector aligned #endif for (int n = iifrom * 8; n < sto; n++) f_scalar[n] += f_scalar2[n]; diff --git a/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp index ad8ef4d84f..ef26f8f2d5 100644 --- a/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp +++ b/src/INTEL/pair_lj_charmm_coul_charmm_intel.cpp @@ -294,9 +294,14 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#else + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcelj, evdwl; diff --git a/src/INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/INTEL/pair_lj_charmm_coul_long_intel.cpp index a910c74acb..6f6bb3618e 100644 --- a/src/INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -314,9 +314,14 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#else + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcelj, evdwl, ecoul; diff --git a/src/INTEL/pair_lj_cut_coul_long_intel.cpp b/src/INTEL/pair_lj_cut_coul_long_intel.cpp index 51e208314b..0d94fdb4c3 100644 --- a/src/INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/INTEL/pair_lj_cut_coul_long_intel.cpp @@ -305,9 +305,14 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#else + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + secoul, sv0, sv1, sv2, sv3, sv4, sv5) +#endif #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t 
forcecoul, forcelj, evdwl, ecoul; diff --git a/src/INTEL/pair_lj_cut_intel.cpp b/src/INTEL/pair_lj_cut_intel.cpp index 84bc664e18..cf84cb3ca5 100644 --- a/src/INTEL/pair_lj_cut_intel.cpp +++ b/src/INTEL/pair_lj_cut_intel.cpp @@ -241,9 +241,15 @@ void PairLJCutIntel::eval(const int offload, const int vflag, if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) \ + aligned(jlist,x,ljc12oi,special_lj,f,lj34i:64) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) + #pragma vector aligned +#endif #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcelj, evdwl; diff --git a/src/INTEL/pair_sw_intel.cpp b/src/INTEL/pair_sw_intel.cpp index 17dffa2843..57a6b29945 100644 --- a/src/INTEL/pair_sw_intel.cpp +++ b/src/INTEL/pair_sw_intel.cpp @@ -371,8 +371,12 @@ void PairSWIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl) +#else #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl) +#endif + #pragma vector aligned #endif for (int jj = 0; jj < ejnum_pad; jj++) { acc_t fjxtmp, fjytmp, fjztmp, fjtmp; diff --git a/src/INTEL/pppm_disp_intel.cpp b/src/INTEL/pppm_disp_intel.cpp index 8d4ed1778d..6b732ccfac 100644 --- a/src/INTEL/pppm_disp_intel.cpp +++ b/src/INTEL/pppm_disp_intel.cpp @@ -770,8 +770,12 @@ void PPPMDispIntel::particle_map(double delx, double dely, double delz, IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T)); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:flag) +#else #pragma simd reduction(+:flag) +#endif + #pragma vector aligned #endif for (int i = iifrom; i < iito; i++) { @@ -876,7 +880,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; @@ -885,7 +893,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3; @@ -917,8 +929,12 @@ void PPPMDispIntel::make_rho_c(IntelBuffers * /*buffers*/) int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mzyx = l + mzy; @@ -939,7 +955,11 @@ void PPPMDispIntel::make_rho_c(IntelBuffers * /*buffers*/) IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr); #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { for (int j = 1; j < nthr; j++) { @@ -1025,7 +1045,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + 
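// pair_lj_cut_intel.cpp above is the one hunk in this excerpt where the OpenMP
// branch drops "#pragma vector aligned" entirely and instead names the aligned
// pointers in an "aligned(...:64)" clause on the simd directive.  Sketch of the
// clause on its own (hypothetical arrays; 64 bytes matches the cache-line
// alignment the INTEL package uses for its buffers):
void axpy(double *__restrict f, const double *__restrict x, int n, double a)
{
#if defined(USE_OMP_SIMD)
  #pragma omp simd aligned(f, x : 64)
#endif
  for (int i = 0; i < n; i++)
    f[i] += a * x[i];             // both arrays promised 64-byte aligned to the vectorizer
}
// Callers must keep that promise (e.g. allocate with _mm_malloc or
// posix_memalign); handing the loop an unaligned pointer is undefined behavior
// once the clause is present.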
#pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -1034,7 +1058,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3; @@ -1067,8 +1095,12 @@ void PPPMDispIntel::make_rho_g(IntelBuffers * /*buffers*/) int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mzyx = l + mzy; @@ -1089,7 +1121,11 @@ void PPPMDispIntel::make_rho_g(IntelBuffers * /*buffers*/) IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr); #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { for (int j = 1; j < nthr; j++) { @@ -1173,7 +1209,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -1182,7 +1222,11 @@ void PPPMDispIntel::make_rho_a(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3; @@ -1215,8 +1259,12 @@ void PPPMDispIntel::make_rho_a(IntelBuffers * /*buffers*/) int my = m + nysum; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -1307,7 +1355,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -1316,7 +1368,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3; @@ -1349,8 +1405,12 @@ void PPPMDispIntel::make_rho_none(IntelBuffers * /*buffers*/) int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mzyx = l + mzy; @@ -1373,7 +1433,11 @@ void PPPMDispIntel::make_rho_none(IntelBuffers * /*buffers*/) IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr); #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < 
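// In the PPPM charge-spreading hunks the Intel-only trip-count hint
// "#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)" is kept
// and simply moved after the new switch, so it still reaches the Intel compiler
// in both branches; other compilers ignore (or warn about) the unknown pragma.
// Sketch with a hypothetical stencil width of 8:
void spread_row(double *__restrict grid, const double *__restrict w,
                double x0, int order)
{
#if defined(LMP_SIMD_COMPILER)
#if defined(USE_OMP_SIMD)
  #pragma omp simd
#else
  #pragma simd
#endif
  #pragma loop_count min(2), max(8), avg(7)   // Intel hint: order is usually 5-7
#endif
  for (int l = 0; l < order; l++)
    grid[l] += x0 * w[l];                     // weighted charge added to the stencil row
}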
ito; i++) { for (int j = 1; j < nthr; j++) { @@ -1454,7 +1518,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho_lookup[idx][k]; @@ -1463,7 +1531,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1 = rho_coeff[order-1][k]; @@ -1498,8 +1570,12 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers * /*buffers*/) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l+nxsum; @@ -1624,7 +1700,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers * /*buffers*/) int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; @@ -1636,7 +1716,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -1680,8 +1764,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers * /*buffers*/) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -1702,7 +1790,11 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers * /*buffers*/) } } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { particle_ekx[i] *= hx_inv; @@ -1802,7 +1894,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho6_lookup[idx][k]; @@ -1811,7 +1907,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; @@ -1846,8 +1946,12 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers * /*buffers*/) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l+nxsum; @@ -1967,7 +2071,11 @@ void 
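// The fieldforce_* lookup branches above vectorize a short, fixed-length copy
// out of a precomputed weight table; because the table rows are padded to
// INTEL_P3M_ALIGNED_MAXORDER, the loop has a known trip count and no
// dependencies, so a bare "omp simd" is sufficient.  Sketch (hypothetical
// padded width of 8 standing in for INTEL_P3M_ALIGNED_MAXORDER):
constexpr int kPaddedOrder = 8;

void load_weights(double rho0[kPaddedOrder],
                  const double table[][kPaddedOrder], int idx)
{
#if defined(USE_OMP_SIMD)
  #pragma omp simd
#endif
  for (int k = 0; k < kPaddedOrder; k++)
    rho0[k] = table[idx][k];      // unit-stride row copy from the lookup table
}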
PPPMDispIntel::fieldforce_g_ad(IntelBuffers * /*buffers*/) int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -1979,7 +2087,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -2023,8 +2135,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers * /*buffers*/) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -2045,7 +2161,11 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers * /*buffers*/) } } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { particle_ekx[i] *= hx_inv; @@ -2143,7 +2263,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho6_lookup[idx][k]; @@ -2152,7 +2276,11 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; @@ -2206,8 +2334,12 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers * /*buffers*/) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l+nxsum; @@ -2398,7 +2530,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers * /*buffers*/) int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -2410,7 +2546,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -2479,8 +2619,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers * /*buffers*/) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -2541,7 +2685,11 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers * /*buffers*/) } } #if 
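// After the stencil gather, the *_ad (analytic differentiation) variants scale
// the accumulated per-particle field by the inverse grid spacing; that tail
// loop is switched to "omp simd" as well.  It is a plain elementwise update, so
// no reduction clause is needed.  Sketch with hypothetical array names:
void scale_fields(double *__restrict ekx, double *__restrict eky,
                  double *__restrict ekz, int ifrom, int ito,
                  double hx_inv, double hy_inv, double hz_inv)
{
#if defined(USE_OMP_SIMD)
  #pragma omp simd
#endif
  for (int i = ifrom; i < ito; i++) {
    ekx[i] *= hx_inv;             // grid-space gradient -> real-space field, x
    eky[i] *= hy_inv;             // y
    ekz[i] *= hz_inv;             // z
  }
}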
defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { particle_ekx0[i] *= hx_inv; @@ -2671,7 +2819,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers * /*buffers*/) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho6_lookup[idx][k]; @@ -2680,7 +2832,11 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; @@ -2721,8 +2877,12 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers * /*buffers*/) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l+nxsum; @@ -2848,7 +3008,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers * /*buffers*/) int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho6_lookup[idx][k]; @@ -2860,7 +3024,11 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers * /*buffers*/) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower_6; k <= nupper_6; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -2909,8 +3077,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers * /*buffers*/) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int l = 0; l < order; l++) { int mx = l + nxsum; @@ -2992,7 +3164,11 @@ void PPPMDispIntel::precompute_rho() for (int i = 0; i < rho_points; i++) { FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower; k<=nupper;k++) { FFT_SCALAR r1 = ZEROF; @@ -3006,7 +3182,11 @@ void PPPMDispIntel::precompute_rho() } if (differentiation_flag == 1) { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower; k<=nupper;k++) { FFT_SCALAR r1 = ZEROF; @@ -3026,7 +3206,11 @@ void PPPMDispIntel::precompute_rho() for (int i = 0; i < rho_points; i++) { FFT_SCALAR dx = -1. 
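// precompute_rho() tabulates the charge-assignment weights by evaluating the
// rho_coeff polynomials at sampled fractional offsets, and those inner loops
// now take the same "omp simd" switch.  Sketch of one table row using Horner's
// rule (hypothetical coefficient layout; the real rho_coeff indexing differs):
void tabulate_row(double *row, const double coeff[][16], double dx,
                  int nlower, int nupper, int order)
{
#if defined(USE_OMP_SIMD)
  #pragma omp simd
#endif
  for (int k = nlower; k <= nupper; k++) {
    double r1 = 0.0;
    for (int l = order - 1; l >= 0; l--)
      r1 = coeff[l][k - nlower] + r1 * dx;   // Horner evaluation of weight k at offset dx
    row[k - nlower] = r1;
  }
}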
+ 1./half_rho_scale * (FFT_SCALAR)i; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower_6; k<=nupper_6;k++) { FFT_SCALAR r1 = ZEROF; @@ -3040,7 +3224,11 @@ void PPPMDispIntel::precompute_rho() } if (differentiation_flag == 1) { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower_6; k<=nupper_6;k++) { FFT_SCALAR r1 = ZEROF; diff --git a/src/INTEL/pppm_intel.cpp b/src/INTEL/pppm_intel.cpp index 8b0542d770..8041709ebc 100644 --- a/src/INTEL/pppm_intel.cpp +++ b/src/INTEL/pppm_intel.cpp @@ -394,8 +394,12 @@ void PPPMIntel::particle_map(IntelBuffers *buffers) IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T)); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned +#if defined(USE_OMP_SIMD) + #pragma omp simd reduction(+:flag) +#else #pragma simd reduction(+:flag) +#endif + #pragma vector aligned #endif for (int i = iifrom; i < iito; i++) { @@ -500,7 +504,11 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; @@ -509,7 +517,11 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3; @@ -541,7 +553,11 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { int mzyx = l + mzy; @@ -563,7 +579,11 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr); #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { for (int j = 1; j < nthr; j++) { @@ -645,7 +665,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho0[k] = rho_lookup[idx][k]; @@ -654,7 +678,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) } } else { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1 = rho_coeff[order-1][k]; @@ -690,7 +718,11 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { int mx = l+nxsum; @@ -813,7 +845,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) dz = dz*half_rho_scale + half_rho_scale_plus; int idz = dz; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; @@ -825,7 +861,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) } } else { 
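// PPPMIntel::particle_map() above flags atoms whose grid cell falls outside the
// owned-plus-ghost brick, which is why both the old and new directives carry a
// "reduction(+:flag)" clause.  Sketch of the idea with hypothetical bounds:
int count_out_of_range(const double *__restrict x, int n,
                       double lo, double inv_h, int nlo, int nhi)
{
  int flag = 0;
#if defined(USE_OMP_SIMD)
  #pragma omp simd reduction(+:flag)
#endif
  for (int i = 0; i < n; i++) {
    const int cell = static_cast<int>((x[i] - lo) * inv_h);
    if (cell < nlo || cell > nhi) flag++;    // each lane counts its own violations
  }
  return flag;
}
// A nonzero return is then turned into an out-of-range-atoms error by the caller.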
#if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; @@ -871,7 +911,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { int mx = l + nxsum; @@ -893,7 +937,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) } #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int i = ifrom; i < ito; i++) { particle_ekx[i] *= hx_inv; @@ -942,7 +990,11 @@ void PPPMIntel::precompute_rho() for (int i = 0; i < rho_points; i++) { FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower; k<=nupper;k++) { FFT_SCALAR r1 = ZEROF; @@ -956,7 +1008,11 @@ void PPPMIntel::precompute_rho() } if (differentiation_flag == 1) { #if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) + #pragma omp simd +#else #pragma simd +#endif #endif for (int k=nlower; k<=nupper;k++) { FFT_SCALAR r1 = ZEROF;
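// "#pragma omp simd" requires OpenMP 4.0 or newer and is only honored when the
// compiler is invoked with an OpenMP (or OpenMP-SIMD-only) flag, e.g.
// -qopenmp / -qopenmp-simd for the Intel compilers or -fopenmp / -fopenmp-simd
// for GCC and Clang.  One plausible way to gate the new path at compile time,
// shown purely as an assumption for illustration and not the actual logic this
// patch adds to intel_preprocess.h:
#if defined(_OPENMP) && (_OPENMP >= 201307)   // 201307 == OpenMP 4.0
#define USE_OMP_SIMD
#endif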