diff --git a/src/INTEL/intel_intrinsics.h b/src/INTEL/intel_intrinsics.h
index a17cc2859a..295310283d 100644
--- a/src/INTEL/intel_intrinsics.h
+++ b/src/INTEL/intel_intrinsics.h
@@ -85,7 +85,11 @@ struct vector_ops {};
 // Intrinsic routines for IMCI and AVX-512
 #if defined(__MIC__) || defined(__AVX512F__)
 // Integer vector class
+#ifdef __INTEL_LLVM_COMPILER
+#pragma pack(push,16)
+#else
 #pragma pack(push,64)
+#endif
 struct ivec32x16 {
   __m512i vec;
   ivec32x16() {}
@@ -113,7 +117,7 @@ struct vector_ops {
   typedef double fscal;
   typedef F64vec8 fvec;
   typedef ivec32x16 ivec;
-  typedef __mmask bvec;
+  typedef __mmask16 bvec;
   typedef double farr[8] __attribute__((aligned(64)));
   typedef int iarr[16] __attribute__((aligned(64)));
   static fvec recip(const fvec &a) { return _mm512_recip_pd(a); }
@@ -250,7 +254,7 @@ struct vector_ops {
   typedef float fscal;
   typedef F32vec16 fvec;
   typedef ivec32x16 ivec;
-  typedef __mmask bvec;
+  typedef __mmask16 bvec;
   typedef float farr[16] __attribute__((aligned(64)));
   typedef int iarr[16] __attribute__((aligned(64)));
   static const bvec full_mask = 0xFFFF;
@@ -380,16 +384,18 @@ struct vector_ops {
     *r3 = gather<4>(*r3, mask, idxs, reinterpret_cast(base) + 12);
   }
   // Additional routines needed for the implementation of mixed precision
-  static fvec cvtdown(const vector_ops::fvec &lo, const vector_ops::fvec &hi) {
+  static fvec cvtdown(const vector_ops::fvec &lo,
+                      const vector_ops::fvec &hi) {
     __m512 t1 = _mm512_cvtpd_pslo(lo);
     __m512 t2 = _mm512_cvtpd_pslo(hi);
-    return _mm512_mask_permute4f128_ps(t1, 0xFF00, t2, _MM_PERM_BADC);
+    return _mm512_mask_shuffle_f32x4(t1, 0xFF00, t2, t2,
+                                     0x4E);
   }
   static vector_ops::fvec cvtup_lo(const fvec &a) {
     return _mm512_cvtpslo_pd(a);
   }
   static vector_ops::fvec cvtup_hi(const fvec &a) {
-    return _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a, _MM_PERM_BADC)); // permute DCBA -> BADC
+    return _mm512_cvtpslo_pd(_mm512_shuffle_f32x4(a, a, 0x4E));
   }
   static void mask_cvtup(const bvec &a, vector_ops::bvec *blo, vector_ops::bvec *bhi) {
     *blo = a & 0xFF;
@@ -1692,7 +1698,7 @@ struct vector_ops {
   typedef flt_t fscal;
   typedef flt_t fvec;
   typedef int ivec;
-  typedef bool bvec;
+  typedef int bvec;
   typedef flt_t farr[1];
   typedef int iarr[1];
   static fvec recip(const fvec &a) {
diff --git a/src/INTEL/intel_intrinsics_airebo.h b/src/INTEL/intel_intrinsics_airebo.h
index f49abbaf3d..ac58ca2438 100644
--- a/src/INTEL/intel_intrinsics_airebo.h
+++ b/src/INTEL/intel_intrinsics_airebo.h
@@ -651,12 +651,12 @@ class avec16pd {
     return a >> 8;
   }
   VEC_INLINE static __m512i get_ivec_hi(__m512i a) {
-    return _mm512_permute4f128_epi32(a, _MM_PERM_BADC);
+    return _mm512_shuffle_i32x4(a, a, 0x4E);
   }
 public:
   VEC_INLINE avec16pd(const FVEC_NAME &a) {
     lo_ = _mm512_cvtpslo_pd(a.val_);
-    hi_ = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a.val_, _MM_PERM_BADC));
+    hi_ = _mm512_cvtpslo_pd(_mm512_shuffle_f32x4(a.val_, a.val_, 0x4E));
   }
   VEC_INLINE static avec16pd undefined() {
     return avec16pd(_mm512_undefined_pd(), _mm512_undefined_pd());
diff --git a/src/INTEL/intel_preprocess.h b/src/INTEL/intel_preprocess.h
index 6f9b37a700..0bec9935db 100644
--- a/src/INTEL/intel_preprocess.h
+++ b/src/INTEL/intel_preprocess.h
@@ -16,6 +16,11 @@
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
+#ifdef __INTEL_LLVM_COMPILER
+#define __INTEL_COMPILER __INTEL_LLVM_COMPILER
+#define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
+#endif
+
 #ifdef __INTEL_COMPILER
 #define LMP_SIMD_COMPILER
 #if (__INTEL_COMPILER_BUILD_DATE > 20160720)
diff --git a/src/INTEL/pair_airebo_intel.cpp b/src/INTEL/pair_airebo_intel.cpp
index 8b2eadbe72..0037ac822d 100644
--- a/src/INTEL/pair_airebo_intel.cpp
+++ b/src/INTEL/pair_airebo_intel.cpp
@@ -2332,7 +2332,7 @@ static void aut_rebo_neigh(KernelArgsAIREBOT * ka) {
     int n_skin = 0;
 
     int lowest_idx;
-    #pragma unroll(4)
+    //#pragma unroll(4)
     for (lowest_idx = 0; lowest_idx < jnum; lowest_idx += fvec::VL) {
       bvec j_mask = bvec::full();
       if (lowest_idx + fvec::VL > jnum) j_mask = bvec::only(jnum - lowest_idx);
diff --git a/src/INTEL/pair_dpd_intel.cpp b/src/INTEL/pair_dpd_intel.cpp
index 0e872efdf6..87226d92dc 100644
--- a/src/INTEL/pair_dpd_intel.cpp
+++ b/src/INTEL/pair_dpd_intel.cpp
@@ -284,7 +284,7 @@ void PairDPDIntel::eval(const int offload, const int vflag,
       }
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma vector aligned nog2s
+      #pragma vector aligned
       #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
                              sv0, sv1, sv2, sv3, sv4, sv5)
       #endif
diff --git a/src/INTEL/pair_eam_intel.cpp b/src/INTEL/pair_eam_intel.cpp
index 04724f599c..aa07a5649b 100644
--- a/src/INTEL/pair_eam_intel.cpp
+++ b/src/INTEL/pair_eam_intel.cpp
@@ -306,7 +306,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       acc_t rhoi = (acc_t)0.0;
       int ej = 0;
       #if defined(LMP_SIMD_COMPILER)
-      #pragma vector aligned nog2s
+      #pragma vector aligned
       #pragma ivdep
       #endif
       for (int jj = 0; jj < jnum; jj++) {
@@ -325,7 +325,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       }
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma vector aligned nog2s
+      #pragma vector aligned
       #pragma simd reduction(+:rhoi)
       #endif
       for (int jj = 0; jj < ej; jj++) {
@@ -412,7 +412,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
     if (EFLAG) tevdwl = (acc_t)0.0;
 
     #if defined(LMP_SIMD_COMPILER)
-    #pragma vector aligned nog2s
+    #pragma vector aligned
     #pragma simd reduction(+:tevdwl)
     #endif
     for (int ii = iifrom; ii < iito; ++ii) {
@@ -486,7 +486,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
 
       int ej = 0;
       #if defined(LMP_SIMD_COMPILER)
-      #pragma vector aligned nog2s
+      #pragma vector aligned
       #pragma ivdep
       #endif
       for (int jj = 0; jj < jnum; jj++) {
@@ -508,7 +508,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       }
 
      #if defined(LMP_SIMD_COMPILER)
-      #pragma vector aligned nog2s
+      #pragma vector aligned
       #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
                              sv0, sv1, sv2, sv3, sv4, sv5)
       #endif
diff --git a/src/INTEL/pair_lj_cut_intel.cpp b/src/INTEL/pair_lj_cut_intel.cpp
index 71e12d7b24..32cc622829 100644
--- a/src/INTEL/pair_lj_cut_intel.cpp
+++ b/src/INTEL/pair_lj_cut_intel.cpp
@@ -237,7 +237,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
         if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma vector aligned nog2s
+      #pragma vector aligned
       #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
                              sv0, sv1, sv2, sv3, sv4, sv5)
       #endif
diff --git a/src/INTEL/pair_tersoff_intel.cpp b/src/INTEL/pair_tersoff_intel.cpp
index 707e8404ff..51574b0d1e 100644
--- a/src/INTEL/pair_tersoff_intel.cpp
+++ b/src/INTEL/pair_tersoff_intel.cpp
@@ -367,11 +367,11 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
       lmp_intel::vector_traits::support_integer_and_gather_ops;
     bool use_scalar = VL < 4;
     if (use_scalar) {
-      IntelKernelTersoff::kernel(ARGS);
+      IntelKernelTersoff::template kernel(ARGS);
     } else if (pack_i) {
-      IntelKernelTersoff::kernel(ARGS);
+      IntelKernelTersoff::template kernel(ARGS);
    } else {
-      IntelKernelTersoff::kernel(ARGS);
+      IntelKernelTersoff::template kernel(ARGS);
     }
     if (EFLAG) oevdwl += sevdwl;
   }
@@ -691,7 +691,8 @@ void IntelKernelTersoff::kernel_step(
   fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
   fvec vrij = sqrt(vrijsq);
   ivec vis_orig = v::int_load_vl(is);
-  ivec vnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, numneigh);
+  ivec vnumneigh_i = v::template int_gather<4>(v_i0, vmask, vis_orig,
+                                               numneigh);
   ivec vc_idx_ij = v::int_mullo(v_i4floats, vw_j +
       v::int_mullo(v_i_ntypes, vw_i));
   fvec vzeta = v::zero();
@@ -718,14 +719,16 @@ void IntelKernelTersoff::kernel_step(
     while (! v::mask_testz(vactive_mask) && cache_idx < N_CACHE) {
      bvec vnew_mask = vactive_mask & ~ veff_old_mask;
      vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
-          v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
+          (v::template int_gather<4>(vks, vactive_mask,
+                                     vkks + vcnumneigh_i,
+                                     firstneigh)));
      v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
      fvec vdx_ik = (vx_k - vx_i);
      fvec vdy_ik = (vy_k - vy_i);
      fvec vdz_ik = (vz_k - vz_i);
      fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
      ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
-     vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
+     vcutsq = v::template gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
      bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
      bvec vsame_mask = v::int_cmpneq(vjs, vks);
      bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
@@ -769,14 +772,16 @@ void IntelKernelTersoff::kernel_step(
     while (! v::mask_testz(vactive_mask)) {
      bvec vnew_mask = vactive_mask & ~ veff_old_mask;
      vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
-          v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
+          (v::template int_gather<4>(vks, vactive_mask,
+                                     vkks + vcnumneigh_i,
+                                     firstneigh)));
      v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
      fvec vdx_ik = (vx_k - vx_i);
      fvec vdy_ik = (vy_k - vy_i);
      fvec vdz_ik = (vz_k - vz_i);
      fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
      ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
-     vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
+     vcutsq = v::template gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
      bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
      bvec vsame_mask = v::int_cmpneq(vjs, vks);
      bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
@@ -836,14 +841,16 @@ void IntelKernelTersoff::kernel_step(
     while (! v::mask_testz(vactive_mask)) {
      bvec vnew_mask = vactive_mask & ~ veff_old_mask;
      vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
-          v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
+          (v::template int_gather<4>(vks, vactive_mask,
+                                     vkks + vcnumneigh_i,
+                                     firstneigh)));
      v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
      fvec vdx_ik = vx_k - vx_i;
      fvec vdy_ik = vy_k - vy_i;
      fvec vdz_ik = vz_k - vz_i;
      fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
      ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
-     vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
+     vcutsq = v::template gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
      bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
      bvec vsame_mask = v::int_cmpneq(vjs, vks);
      bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
@@ -991,7 +998,7 @@ void IntelKernelTersoff::kernel_step_const_i(
      fvec vdy_ik = vy_k - vy_i;
      fvec vdz_ik = vz_k - vz_i;
      fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
-     fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
+     fvec vcutsq = v::template gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
      bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
      bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast(4 * sizeof(typename v::fscal) * k)));
      bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
@@ -1035,7 +1042,7 @@ void IntelKernelTersoff::kernel_step_const_i(
      fvec vdy_ik = vy_k - vy_i;
      fvec vdz_ik = vz_k - vz_i;
      fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
-     fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
+     fvec vcutsq = v::template gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
      bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
      bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast(4 * sizeof(typename v::fscal) * k)));
      bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
@@ -1082,7 +1089,7 @@ void IntelKernelTersoff::kernel_step_const_i(
      fvec vdy_ik = vy_k - vy_i;
      fvec vdz_ik = vz_k - vz_i;
      fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
-     fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k].cutsq);
+     fvec vcutsq = v::template gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k].cutsq);
      bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
      bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast(4 * sizeof(typename v::fscal) * k)));
      bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
@@ -1228,7 +1235,7 @@ void IntelKernelTersoff::kernel(
 
 
 template
-IntelKernelTersoff::fvec IntelKernelTersoff::zeta_vector(
+typename IntelKernelTersoff::fvec IntelKernelTersoff::zeta_vector(
   const c_inner_t * param, ivec xjw,
   bvec mask,
   fvec vrij, fvec rsq2,
@@ -1354,6 +1361,8 @@ void IntelKernelTersoff::force_zeta_vector(
   }
 }
 
+#define BCF lmp_intel::vector_routines
+
 template
 template
 void IntelKernelTersoff::attractive_vector(
@@ -1393,7 +1402,7 @@ void IntelKernelTersoff::attractive_vector(
   fvec varg3 = varg1 * varg1 * varg1;
   bvec mask_ex = v::cmpeq(vppowermint, fvec(3.));
   fvec varg = v::blend(mask_ex, varg1, varg3);
-  fvec vex_delr = min(fvec(1.e30), exp(varg));
+  fvec vex_delr = BCF::min(fvec(1.e30), exp(varg));
   fvec vex_delr_d_factor = v::blend(mask_ex, v_1_0, fvec(3.0) * varg1 * varg1);
   fvec vex_delr_d = vplam3 * vex_delr_d_factor * vex_delr;
   bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - vpbigd) & mask;
@@ -1413,12 +1422,12 @@ void IntelKernelTersoff::attractive_vector(
   if (ZETA) *zeta = vfc * vgijk * vex_delr;
 
   fvec vminus_costheta = - vcostheta;
-  fvec vdcosdrjx = vrijinv * fmadd(vminus_costheta, vrij_hatx, rik_hatx);
-  fvec vdcosdrjy = vrijinv * fmadd(vminus_costheta, vrij_haty, rik_haty);
-  fvec vdcosdrjz = vrijinv * fmadd(vminus_costheta, vrij_hatz, rik_hatz);
-  fvec vdcosdrkx = rikinv * fmadd(vminus_costheta, rik_hatx, vrij_hatx);
-  fvec vdcosdrky = rikinv * fmadd(vminus_costheta, rik_haty, vrij_haty);
-  fvec vdcosdrkz = rikinv * fmadd(vminus_costheta, rik_hatz, vrij_hatz);
+  fvec vdcosdrjx = vrijinv * BCF::fmadd(vminus_costheta, vrij_hatx, rik_hatx);
+  fvec vdcosdrjy = vrijinv * BCF::fmadd(vminus_costheta, vrij_haty, rik_haty);
+  fvec vdcosdrjz = vrijinv * BCF::fmadd(vminus_costheta, vrij_hatz, rik_hatz);
+  fvec vdcosdrkx = rikinv * BCF::fmadd(vminus_costheta, rik_hatx, vrij_hatx);
+  fvec vdcosdrky = rikinv * BCF::fmadd(vminus_costheta, rik_haty, vrij_haty);
+  fvec vdcosdrkz = rikinv * BCF::fmadd(vminus_costheta, rik_hatz, vrij_hatz);
   fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx);
   fvec vdcosdriy = -(vdcosdrjy + vdcosdrky);
   fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz);
diff --git a/src/lmptype.h b/src/lmptype.h
index 6a7a7ef1b9..871bf5ff6c 100644
--- a/src/lmptype.h
+++ b/src/lmptype.h
@@ -242,9 +242,9 @@ union ubuf {
 
 // define stack variable alignment
 
-#if defined(__INTEL_LLVM_COMPILER) || defined(__INTEL_COMPILER)
+#if defined(__INTEL_COMPILER)
 #define _alignvar(expr, val) __declspec(align(val)) expr
-#elif defined(__GNUC__) || defined(__PGI)
+#elif defined(__GNUC__) || defined(__PGI) || defined(__INTEL_LLVM_COMPILER)
 #define _alignvar(expr, val) expr __attribute((aligned(val)))
 #else
 #define _alignvar(expr, val) expr
@@ -266,7 +266,7 @@ union ubuf {
 
 #if defined(__clang__)
 #define _noopt __attribute__((optnone))
-#elif defined(__INTEL_COMPILER)
+#elif defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
 #define _noopt
 #elif defined(__PGI)
 #define _noopt
diff --git a/src/memory.cpp b/src/memory.cpp
index 3e67ed1496..8f7faad545 100644
--- a/src/memory.cpp
+++ b/src/memory.cpp
@@ -16,7 +16,8 @@
 
 #include "error.h"
 
-#if defined(LMP_USER_INTEL) && defined(__INTEL_COMPILER)
+#if defined(LMP_USER_INTEL) && \
+  ((defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)))
 #ifndef LMP_INTEL_NO_TBB
 #define LMP_USE_TBB_ALLOCATOR
 #include "tbb/scalable_allocator.h"
@@ -81,7 +82,7 @@ void *Memory::srealloc(void *ptr, bigint nbytes, const char *name)
 #if defined(LMP_USE_TBB_ALLOCATOR)
   ptr = scalable_aligned_realloc(ptr, nbytes, LAMMPS_MEMALIGN);
 #elif defined(LMP_INTEL_NO_TBB) && defined(LAMMPS_MEMALIGN) && \
-  defined(__INTEL_COMPILER)
+  (defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER))
   ptr = realloc(ptr, nbytes);
   uintptr_t offset = ((uintptr_t)(const void *)(ptr)) % LAMMPS_MEMALIGN;
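
Note on the recurring permute4f128 -> shuffle_f32x4 / shuffle_i32x4 substitutions above: _mm512_permute4f128_ps(a, _MM_PERM_BADC) is a KNC-era compatibility intrinsic that the LLVM-based Intel compiler does not provide, while _mm512_shuffle_f32x4(a, a, 0x4E) is plain AVX-512F. The immediate 0x4E selects the 128-bit lanes in the order {2,3,0,1}, i.e. it swaps the lower and upper 256-bit halves, which is exactly what cvtup_hi() and the avec16pd constructor rely on to bring elements 8..15 into the low half before _mm512_cvtpslo_pd(). The following standalone sketch (illustrative only, not part of the patch; file name and build flags are assumptions) demonstrates that lane order:

// verify_lane_swap.cpp -- illustrative sketch only, not part of this patch.
// Build with AVX-512 enabled, e.g. "g++ -mavx512f verify_lane_swap.cpp" or
// "icpx -xCORE-AVX512 verify_lane_swap.cpp", and run on an AVX-512 CPU.
#include <immintrin.h>
#include <cstdio>

int main() {
  alignas(64) float in[16], out[16];
  for (int i = 0; i < 16; ++i) in[i] = static_cast<float>(i);

  __m512 a = _mm512_load_ps(in);
  // imm8 = 0x4E = 0b01001110 selects 128-bit lanes {2,3,0,1}, so the upper
  // 256 bits move to the bottom -- the same ordering _MM_PERM_BADC produced.
  __m512 swapped = _mm512_shuffle_f32x4(a, a, 0x4E);
  _mm512_store_ps(out, swapped);

  for (int i = 0; i < 16; ++i) printf("%g ", out[i]);  // 8 9 ... 15 0 1 ... 7
  printf("\n");
  return 0;
}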