Fixes to the Intel package to remove errors and warnings with the clang-based compiler and to support alternate preprocessor defines for the next-generation Intel compiler (currently not recommended for performance).

Mike Brown
2021-07-19 11:16:09 -07:00
parent b2a396da3c
commit d739c3bede
10 changed files with 63 additions and 42 deletions
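
Background for the preprocessor-define changes described above (a hedged sketch, not part of the commit itself): the clang-based Intel compilers identify themselves with __INTEL_LLVM_COMPILER instead of __INTEL_COMPILER, and that macro expands to a date-like version number (assumed here to be on the order of 20210300 for oneAPI 2021.3). Mapping the classic macros onto it, as one of the header hunks below does, lets existing version guards keep working:

// Hedged C++ sketch of the detection pattern; the macro value is an assumption.
#ifdef __INTEL_LLVM_COMPILER                  // clang-based Intel compiler
#define __INTEL_COMPILER __INTEL_LLVM_COMPILER
#define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
#endif

#ifdef __INTEL_COMPILER                       // now true for both compiler families
#define LMP_SIMD_COMPILER                     // enables the "#pragma simd" blocks below
#if (__INTEL_COMPILER_BUILD_DATE > 20160720)  // date-style check still passes
// newer-compiler code paths remain enabled
#endif
#endif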

View File

@@ -85,7 +85,11 @@ struct vector_ops {};
// Intrinsic routines for IMCI and AVX-512
#if defined(__MIC__) || defined(__AVX512F__)
// Integer vector class
#ifdef __INTEL_LLVM_COMPILER
#pragma pack(push,16)
#else
#pragma pack(push,64)
#endif
struct ivec32x16 {
__m512i vec;
ivec32x16() {}
@@ -113,7 +117,7 @@ struct vector_ops<double, KNC> {
typedef double fscal;
typedef F64vec8 fvec;
typedef ivec32x16 ivec;
typedef __mmask bvec;
typedef __mmask16 bvec;
typedef double farr[8] __attribute__((aligned(64)));
typedef int iarr[16] __attribute__((aligned(64)));
static fvec recip(const fvec &a) { return _mm512_recip_pd(a); }
@@ -250,7 +254,7 @@ struct vector_ops<float, KNC> {
typedef float fscal;
typedef F32vec16 fvec;
typedef ivec32x16 ivec;
typedef __mmask bvec;
typedef __mmask16 bvec;
typedef float farr[16] __attribute__((aligned(64)));
typedef int iarr[16] __attribute__((aligned(64)));
static const bvec full_mask = 0xFFFF;
@@ -380,16 +384,18 @@ struct vector_ops<float, KNC> {
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 12);
}
// Additional routines needed for the implementation of mixed precision
static fvec cvtdown(const vector_ops<double,KNC>::fvec &lo, const vector_ops<double,KNC>::fvec &hi) {
static fvec cvtdown(const vector_ops<double,KNC>::fvec &lo,
const vector_ops<double,KNC>::fvec &hi) {
__m512 t1 = _mm512_cvtpd_pslo(lo);
__m512 t2 = _mm512_cvtpd_pslo(hi);
return _mm512_mask_permute4f128_ps(t1, 0xFF00, t2, _MM_PERM_BADC);
return _mm512_mask_shuffle_f32x4(t1, 0xFF00, t2, t2, 0x4E); // low 8 floats from lo (t1), high 8 from hi (t2)
}
static vector_ops<double,KNC>::fvec cvtup_lo(const fvec &a) {
return _mm512_cvtpslo_pd(a);
}
static vector_ops<double,KNC>::fvec cvtup_hi(const fvec &a) {
return _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a, _MM_PERM_BADC)); // permute DCBA -> BADC
return _mm512_cvtpslo_pd(_mm512_shuffle_f32x4(a, a, 0x4E)); // 0x4E swaps the 256-bit halves (DCBA -> BADC)
}
static void mask_cvtup(const bvec &a, vector_ops<double,KNC>::bvec *blo, vector_ops<double,KNC>::bvec *bhi) {
*blo = a & 0xFF;
@@ -1692,7 +1698,7 @@ struct vector_ops<flt_t, NONE> {
typedef flt_t fscal;
typedef flt_t fvec;
typedef int ivec;
typedef bool bvec;
typedef int bvec;
typedef flt_t farr[1];
typedef int iarr[1];
static fvec recip(const fvec &a) {

View File

@@ -651,12 +651,12 @@ class avec16pd {
return a >> 8;
}
VEC_INLINE static __m512i get_ivec_hi(__m512i a) {
return _mm512_permute4f128_epi32(a, _MM_PERM_BADC);
return _mm512_shuffle_i32x4(a, a, 0x4E);
}
public:
VEC_INLINE avec16pd(const FVEC_NAME &a) {
lo_ = _mm512_cvtpslo_pd(a.val_);
hi_ = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a.val_, _MM_PERM_BADC));
hi_ = _mm512_cvtpslo_pd(_mm512_shuffle_f32x4(a.val_, a.val_, 0x4E));
}
VEC_INLINE static avec16pd undefined() {
return avec16pd(_mm512_undefined_pd(), _mm512_undefined_pd());

View File

@@ -16,6 +16,11 @@
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#ifdef __INTEL_LLVM_COMPILER
#define __INTEL_COMPILER __INTEL_LLVM_COMPILER
#define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
#endif
#ifdef __INTEL_COMPILER
#define LMP_SIMD_COMPILER
#if (__INTEL_COMPILER_BUILD_DATE > 20160720)

View File

@@ -2332,7 +2332,7 @@ static void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
int n_skin = 0;
int lowest_idx;
#pragma unroll(4)
//#pragma unroll(4)
for (lowest_idx = 0; lowest_idx < jnum; lowest_idx += fvec::VL) {
bvec j_mask = bvec::full();
if (lowest_idx + fvec::VL > jnum) j_mask = bvec::only(jnum - lowest_idx);

View File

@@ -284,7 +284,7 @@ void PairDPDIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned nog2s
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif

View File

@@ -306,7 +306,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
acc_t rhoi = (acc_t)0.0;
int ej = 0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned nog2s
#pragma vector aligned
#pragma ivdep
#endif
for (int jj = 0; jj < jnum; jj++) {
@@ -325,7 +325,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned nog2s
#pragma vector aligned
#pragma simd reduction(+:rhoi)
#endif
for (int jj = 0; jj < ej; jj++) {
@@ -412,7 +412,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
if (EFLAG) tevdwl = (acc_t)0.0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned nog2s
#pragma vector aligned
#pragma simd reduction(+:tevdwl)
#endif
for (int ii = iifrom; ii < iito; ++ii) {
@@ -486,7 +486,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
int ej = 0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned nog2s
#pragma vector aligned
#pragma ivdep
#endif
for (int jj = 0; jj < jnum; jj++) {
@@ -508,7 +508,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned nog2s
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif

View File

@@ -237,7 +237,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned nog2s
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif

View File

@@ -367,11 +367,11 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
bool use_scalar = VL < 4;
if (use_scalar) {
IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EFLAG>(ARGS);
IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::template kernel<EFLAG>(ARGS);
} else if (pack_i) {
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EFLAG>(ARGS);
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::template kernel<EFLAG>(ARGS);
} else {
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EFLAG>(ARGS);
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::template kernel<EFLAG>(ARGS);
}
if (EFLAG) oevdwl += sevdwl;
}
@@ -691,7 +691,8 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
fvec vrij = sqrt(vrijsq);
ivec vis_orig = v::int_load_vl(is);
ivec vnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, numneigh);
ivec vnumneigh_i = v::template int_gather<4>(v_i0, vmask, vis_orig,
numneigh);
ivec vc_idx_ij = v::int_mullo(v_i4floats, vw_j + v::int_mullo(v_i_ntypes, vw_i));
fvec vzeta = v::zero();
@@ -718,14 +719,16 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
while (! v::mask_testz(vactive_mask) && cache_idx < N_CACHE) {
bvec vnew_mask = vactive_mask & ~ veff_old_mask;
vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
(v::template int_gather<4>(vks, vactive_mask,
vkks + vcnumneigh_i,
firstneigh)));
v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
fvec vdx_ik = (vx_k - vx_i);
fvec vdy_ik = (vy_k - vy_i);
fvec vdz_ik = (vz_k - vz_i);
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
vcutsq = v::template gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, vks);
bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
@@ -769,14 +772,16 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
while (! v::mask_testz(vactive_mask)) {
bvec vnew_mask = vactive_mask & ~ veff_old_mask;
vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
(v::template int_gather<4>(vks, vactive_mask,
vkks + vcnumneigh_i,
firstneigh)));
v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
fvec vdx_ik = (vx_k - vx_i);
fvec vdy_ik = (vy_k - vy_i);
fvec vdz_ik = (vz_k - vz_i);
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
vcutsq = v::template gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, vks);
bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
@@ -836,14 +841,16 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
while (! v::mask_testz(vactive_mask)) {
bvec vnew_mask = vactive_mask & ~ veff_old_mask;
vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
(v::template int_gather<4>(vks, vactive_mask,
vkks + vcnumneigh_i,
firstneigh)));
v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
fvec vdx_ik = vx_k - vx_i;
fvec vdy_ik = vy_k - vy_i;
fvec vdz_ik = vz_k - vz_i;
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
vcutsq = v::template gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, vks);
bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
@@ -991,7 +998,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
fvec vdy_ik = vy_k - vy_i;
fvec vdz_ik = vz_k - vz_i;
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
fvec vcutsq = v::template gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
@@ -1035,7 +1042,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
fvec vdy_ik = vy_k - vy_i;
fvec vdz_ik = vz_k - vz_i;
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
fvec vcutsq = v::template gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
@@ -1082,7 +1089,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
fvec vdy_ik = vy_k - vy_i;
fvec vdz_ik = vz_k - vz_i;
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k].cutsq);
fvec vcutsq = v::template gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k].cutsq);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
@@ -1228,7 +1235,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector(
typename IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector(
const c_inner_t * param,
ivec xjw, bvec mask,
fvec vrij, fvec rsq2,
@@ -1354,6 +1361,8 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::force_zeta_vector(
}
}
#define BCF lmp_intel::vector_routines<flt_t, acc_t, mic>
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<bool ZETA>
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
@@ -1393,7 +1402,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
fvec varg3 = varg1 * varg1 * varg1;
bvec mask_ex = v::cmpeq(vppowermint, fvec(3.));
fvec varg = v::blend(mask_ex, varg1, varg3);
fvec vex_delr = min(fvec(1.e30), exp(varg));
fvec vex_delr = BCF::min(fvec(1.e30), exp(varg));
fvec vex_delr_d_factor = v::blend(mask_ex, v_1_0, fvec(3.0) * varg1 * varg1);
fvec vex_delr_d = vplam3 * vex_delr_d_factor * vex_delr;
bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - vpbigd) & mask;
@@ -1413,12 +1422,12 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
if (ZETA) *zeta = vfc * vgijk * vex_delr;
fvec vminus_costheta = - vcostheta;
fvec vdcosdrjx = vrijinv * fmadd(vminus_costheta, vrij_hatx, rik_hatx);
fvec vdcosdrjy = vrijinv * fmadd(vminus_costheta, vrij_haty, rik_haty);
fvec vdcosdrjz = vrijinv * fmadd(vminus_costheta, vrij_hatz, rik_hatz);
fvec vdcosdrkx = rikinv * fmadd(vminus_costheta, rik_hatx, vrij_hatx);
fvec vdcosdrky = rikinv * fmadd(vminus_costheta, rik_haty, vrij_haty);
fvec vdcosdrkz = rikinv * fmadd(vminus_costheta, rik_hatz, vrij_hatz);
fvec vdcosdrjx = vrijinv * BCF::fmadd(vminus_costheta, vrij_hatx, rik_hatx);
fvec vdcosdrjy = vrijinv * BCF::fmadd(vminus_costheta, vrij_haty, rik_haty);
fvec vdcosdrjz = vrijinv * BCF::fmadd(vminus_costheta, vrij_hatz, rik_hatz);
fvec vdcosdrkx = rikinv * BCF::fmadd(vminus_costheta, rik_hatx, vrij_hatx);
fvec vdcosdrky = rikinv * BCF::fmadd(vminus_costheta, rik_haty, vrij_haty);
fvec vdcosdrkz = rikinv * BCF::fmadd(vminus_costheta, rik_hatz, vrij_hatz);
fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx);
fvec vdcosdriy = -(vdcosdrjy + vdcosdrky);
fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz);

View File

@@ -242,9 +242,9 @@ union ubuf {
// define stack variable alignment
#if defined(__INTEL_LLVM_COMPILER) || defined(__INTEL_COMPILER)
#if defined(__INTEL_COMPILER)
#define _alignvar(expr, val) __declspec(align(val)) expr
#elif defined(__GNUC__) || defined(__PGI)
#elif defined(__GNUC__) || defined(__PGI) || defined(__INTEL_LLVM_COMPILER)
#define _alignvar(expr, val) expr __attribute((aligned(val)))
#else
#define _alignvar(expr, val) expr
@@ -266,7 +266,7 @@ union ubuf {
#if defined(__clang__)
#define _noopt __attribute__((optnone))
#elif defined(__INTEL_COMPILER)
#elif defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
#define _noopt
#elif defined(__PGI)
#define _noopt

View File

@@ -16,7 +16,8 @@
#include "error.h"
#if defined(LMP_USER_INTEL) && defined(__INTEL_COMPILER)
#if defined(LMP_USER_INTEL) && \
((defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)))
#ifndef LMP_INTEL_NO_TBB
#define LMP_USE_TBB_ALLOCATOR
#include "tbb/scalable_allocator.h"
@@ -81,7 +82,7 @@ void *Memory::srealloc(void *ptr, bigint nbytes, const char *name)
#if defined(LMP_USE_TBB_ALLOCATOR)
ptr = scalable_aligned_realloc(ptr, nbytes, LAMMPS_MEMALIGN);
#elif defined(LMP_INTEL_NO_TBB) && defined(LAMMPS_MEMALIGN) && \
defined(__INTEL_COMPILER)
(defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER))
ptr = realloc(ptr, nbytes);
uintptr_t offset = ((uintptr_t)(const void *)(ptr)) % LAMMPS_MEMALIGN;