massive whitespace cleanup in USER-INTEL

The following were removed:
- DOS/Windows text-format carriage return characters (^M)
- tabs (replaced with spaces; tabs are evil!!)
- trailing whitespace
This commit is contained in:
Axel Kohlmeyer
2017-06-19 13:23:01 -04:00
parent b687d16177
commit 3c329d1707
62 changed files with 7890 additions and 7890 deletions

View File

@ -37,7 +37,7 @@ typedef struct { int a,b,c,t; } int4_t;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp) AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp)
{ {
suffix_flag |= Suffix::INTEL; suffix_flag |= Suffix::INTEL;
} }
@ -74,8 +74,8 @@ void AngleCharmmIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void AngleCharmmIntel::compute(int eflag, int vflag, void AngleCharmmIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) ev_setup(eflag,vflag); if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = 0; else evflag = 0;
@ -83,14 +83,14 @@ void AngleCharmmIntel::compute(int eflag, int vflag,
if (evflag) { if (evflag) {
if (vflag && !eflag) { if (vflag && !eflag) {
if (force->newton_bond) if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc); eval<0,1,1>(vflag, buffers, fc);
else else
eval<0,1,0>(vflag, buffers, fc); eval<0,1,0>(vflag, buffers, fc);
} else { } else {
if (force->newton_bond) if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc); eval<1,1,1>(vflag, buffers, fc);
else else
eval<1,1,0>(vflag, buffers, fc); eval<1,1,0>(vflag, buffers, fc);
} }
} else { } else {
if (force->newton_bond) if (force->newton_bond)
@ -103,9 +103,9 @@ void AngleCharmmIntel::compute(int eflag, int vflag,
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void AngleCharmmIntel::eval(const int vflag, void AngleCharmmIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
const int inum = neighbor->nanglelist; const int inum = neighbor->nanglelist;
@ -133,7 +133,7 @@ void AngleCharmmIntel::eval(const int vflag,
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp parallel default(none) \ #pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \ shared(f_start,f_stride,fc) \
reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5) reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
#endif #endif
{ {
@ -148,7 +148,7 @@ void AngleCharmmIntel::eval(const int vflag,
if (fix->need_zero(tid)) if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T)); memset(f, 0, f_stride * sizeof(FORCE_T));
const int4_t * _noalias const anglelist = const int4_t * _noalias const anglelist =
(int4_t *) neighbor->anglelist[0]; (int4_t *) neighbor->anglelist[0];
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
@ -246,35 +246,35 @@ void AngleCharmmIntel::eval(const int vflag,
{ {
if (NEWTON_BOND || i1 < nlocal) { if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += f1x; f[i1].x += f1x;
f[i1].y += f1y; f[i1].y += f1y;
f[i1].z += f1z; f[i1].z += f1z;
} }
if (NEWTON_BOND || i2 < nlocal) { if (NEWTON_BOND || i2 < nlocal) {
f[i2].x -= f1x + f3x; f[i2].x -= f1x + f3x;
f[i2].y -= f1y + f3y; f[i2].y -= f1y + f3y;
f[i2].z -= f1z + f3z; f[i2].z -= f1z + f3z;
} }
if (NEWTON_BOND || i3 < nlocal) { if (NEWTON_BOND || i3 < nlocal) {
f[i3].x += f3x; f[i3].x += f3x;
f[i3].y += f3y; f[i3].y += f3y;
f[i3].z += f3z; f[i3].z += f3z;
} }
} }
if (EFLAG || VFLAG) { if (EFLAG || VFLAG) {
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
dely1, delz1, delx2, dely2, delz2, seangle, dely1, delz1, delx2, dely2, delz2, seangle,
f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
sv4, sv5); sv4, sv5);
#else #else
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
dely1, delz1, delx2, dely2, delz2, oeangle, dely1, delz1, delx2, dely2, delz2, oeangle,
f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
ov4, ov5); ov4, ov5);
#endif #endif
} }
@ -282,8 +282,8 @@ void AngleCharmmIntel::eval(const int vflag,
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
if (EFLAG) oeangle += seangle; if (EFLAG) oeangle += seangle;
if (VFLAG && vflag) { if (VFLAG && vflag) {
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov0 += sv0; ov1 += sv1; ov2 += sv2;
ov3 += sv3; ov4 += sv4; ov5 += sv5; ov3 += sv3; ov4 += sv4; ov5 += sv5;
} }
#endif #endif
} // omp parallel } // omp parallel
@ -291,7 +291,7 @@ void AngleCharmmIntel::eval(const int vflag,
if (EFLAG) energy += oeangle; if (EFLAG) energy += oeangle;
if (VFLAG && vflag) { if (VFLAG && vflag) {
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
} }
fix->set_reduce_flag(); fix->set_reduce_flag();
@ -348,11 +348,11 @@ void AngleCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void AngleCharmmIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes, void AngleCharmmIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes,
Memory *memory) { Memory *memory) {
if (nangletypes != _nangletypes) { if (nangletypes != _nangletypes) {
if (_nangletypes > 0) if (_nangletypes > 0)
_memory->destroy(fc); _memory->destroy(fc);
if (nangletypes > 0) if (nangletypes > 0)
_memory->create(fc,nangletypes,"anglecharmmintel.fc"); _memory->create(fc,nangletypes,"anglecharmmintel.fc");
} }

View File

@ -45,8 +45,8 @@ class AngleCharmmIntel : public AngleCharmm {
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers); IntelBuffers<flt_t, acc_t> *buffers);

View File

@ -37,7 +37,7 @@ typedef struct { int a,b,c,t; } int4_t;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
AngleHarmonicIntel::AngleHarmonicIntel(LAMMPS *lmp) : AngleHarmonic(lmp) AngleHarmonicIntel::AngleHarmonicIntel(LAMMPS *lmp) : AngleHarmonic(lmp)
{ {
suffix_flag |= Suffix::INTEL; suffix_flag |= Suffix::INTEL;
} }
@ -74,8 +74,8 @@ void AngleHarmonicIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void AngleHarmonicIntel::compute(int eflag, int vflag, void AngleHarmonicIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) ev_setup(eflag,vflag); if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = 0; else evflag = 0;
@ -83,14 +83,14 @@ void AngleHarmonicIntel::compute(int eflag, int vflag,
if (evflag) { if (evflag) {
if (vflag && !eflag) { if (vflag && !eflag) {
if (force->newton_bond) if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc); eval<0,1,1>(vflag, buffers, fc);
else else
eval<0,1,0>(vflag, buffers, fc); eval<0,1,0>(vflag, buffers, fc);
} else { } else {
if (force->newton_bond) if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc); eval<1,1,1>(vflag, buffers, fc);
else else
eval<1,1,0>(vflag, buffers, fc); eval<1,1,0>(vflag, buffers, fc);
} }
} else { } else {
if (force->newton_bond) if (force->newton_bond)
@ -103,9 +103,9 @@ void AngleHarmonicIntel::compute(int eflag, int vflag,
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void AngleHarmonicIntel::eval(const int vflag, void AngleHarmonicIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
const int inum = neighbor->nanglelist; const int inum = neighbor->nanglelist;
@ -133,7 +133,7 @@ void AngleHarmonicIntel::eval(const int vflag,
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp parallel default(none) \ #pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \ shared(f_start,f_stride,fc) \
reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5) reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
#endif #endif
{ {
@ -148,7 +148,7 @@ void AngleHarmonicIntel::eval(const int vflag,
if (fix->need_zero(tid)) if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T)); memset(f, 0, f_stride * sizeof(FORCE_T));
const int4_t * _noalias const anglelist = const int4_t * _noalias const anglelist =
(int4_t *) neighbor->anglelist[0]; (int4_t *) neighbor->anglelist[0];
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
@ -228,35 +228,35 @@ void AngleHarmonicIntel::eval(const int vflag,
{ {
if (NEWTON_BOND || i1 < nlocal) { if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += f1x; f[i1].x += f1x;
f[i1].y += f1y; f[i1].y += f1y;
f[i1].z += f1z; f[i1].z += f1z;
} }
if (NEWTON_BOND || i2 < nlocal) { if (NEWTON_BOND || i2 < nlocal) {
f[i2].x -= f1x + f3x; f[i2].x -= f1x + f3x;
f[i2].y -= f1y + f3y; f[i2].y -= f1y + f3y;
f[i2].z -= f1z + f3z; f[i2].z -= f1z + f3z;
} }
if (NEWTON_BOND || i3 < nlocal) { if (NEWTON_BOND || i3 < nlocal) {
f[i3].x += f3x; f[i3].x += f3x;
f[i3].y += f3y; f[i3].y += f3y;
f[i3].z += f3z; f[i3].z += f3z;
} }
} }
if (EFLAG || VFLAG) { if (EFLAG || VFLAG) {
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3, IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
delz1, delx2, dely2, delz2, seangle, f, delz1, delx2, dely2, delz2, seangle, f,
NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4,
sv5); sv5);
#else #else
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3, IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
delz1, delx2, dely2, delz2, oeangle, f, delz1, delx2, dely2, delz2, oeangle, f,
NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4,
ov5); ov5);
#endif #endif
} }
@ -264,8 +264,8 @@ void AngleHarmonicIntel::eval(const int vflag,
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
if (EFLAG) oeangle += seangle; if (EFLAG) oeangle += seangle;
if (VFLAG && vflag) { if (VFLAG && vflag) {
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov0 += sv0; ov1 += sv1; ov2 += sv2;
ov3 += sv3; ov4 += sv4; ov5 += sv5; ov3 += sv3; ov4 += sv4; ov5 += sv5;
} }
#endif #endif
} // omp parallel } // omp parallel
@ -273,7 +273,7 @@ void AngleHarmonicIntel::eval(const int vflag,
if (EFLAG) energy += oeangle; if (EFLAG) energy += oeangle;
if (VFLAG && vflag) { if (VFLAG && vflag) {
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
} }
fix->set_reduce_flag(); fix->set_reduce_flag();
@ -328,11 +328,11 @@ void AngleHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void AngleHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes, void AngleHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes,
Memory *memory) { Memory *memory) {
if (nangletypes != _nangletypes) { if (nangletypes != _nangletypes) {
if (_nangletypes > 0) if (_nangletypes > 0)
_memory->destroy(fc); _memory->destroy(fc);
if (nangletypes > 0) if (nangletypes > 0)
_memory->create(fc,nangletypes,"anglecharmmintel.fc"); _memory->create(fc,nangletypes,"anglecharmmintel.fc");
} }

View File

@ -45,8 +45,8 @@ class AngleHarmonicIntel : public AngleHarmonic {
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers); IntelBuffers<flt_t, acc_t> *buffers);

View File

@ -33,7 +33,7 @@ typedef struct { int a,b,t; } int3_t;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
BondFENEIntel::BondFENEIntel(LAMMPS *lmp) : BondFENE(lmp) BondFENEIntel::BondFENEIntel(LAMMPS *lmp) : BondFENE(lmp)
{ {
suffix_flag |= Suffix::INTEL; suffix_flag |= Suffix::INTEL;
} }
@ -70,8 +70,8 @@ void BondFENEIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void BondFENEIntel::compute(int eflag, int vflag, void BondFENEIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) ev_setup(eflag,vflag); if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = 0; else evflag = 0;
@ -79,14 +79,14 @@ void BondFENEIntel::compute(int eflag, int vflag,
if (evflag) { if (evflag) {
if (vflag && !eflag) { if (vflag && !eflag) {
if (force->newton_bond) if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc); eval<0,1,1>(vflag, buffers, fc);
else else
eval<0,1,0>(vflag, buffers, fc); eval<0,1,0>(vflag, buffers, fc);
} else { } else {
if (force->newton_bond) if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc); eval<1,1,1>(vflag, buffers, fc);
else else
eval<1,1,0>(vflag, buffers, fc); eval<1,1,0>(vflag, buffers, fc);
} }
} else { } else {
if (force->newton_bond) if (force->newton_bond)
@ -97,9 +97,9 @@ void BondFENEIntel::compute(int eflag, int vflag,
} }
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void BondFENEIntel::eval(const int vflag, void BondFENEIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
const int inum = neighbor->nbondlist; const int inum = neighbor->nbondlist;
if (inum == 0) return; if (inum == 0) return;
@ -126,7 +126,7 @@ void BondFENEIntel::eval(const int vflag,
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp parallel default(none) \ #pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \ shared(f_start,f_stride,fc) \
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5) reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
#endif #endif
{ {
@ -141,7 +141,7 @@ void BondFENEIntel::eval(const int vflag,
if (fix->need_zero(tid)) if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T)); memset(f, 0, f_stride * sizeof(FORCE_T));
const int3_t * _noalias const bondlist = const int3_t * _noalias const bondlist =
(int3_t *) neighbor->bondlist[0]; (int3_t *) neighbor->bondlist[0];
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
@ -176,7 +176,7 @@ void BondFENEIntel::eval(const int vflag,
// if r -> r0, then rlogarg < 0.0 which is an error // if r -> r0, then rlogarg < 0.0 which is an error
// issue a warning and reset rlogarg = epsilon // issue a warning and reset rlogarg = epsilon
// if r > 2*r0 something serious is wrong, abort // if r > 2*r0 something serious is wrong, abort
if (rlogarg < (flt_t)0.1) { if (rlogarg < (flt_t)0.1) {
char str[128]; char str[128];
sprintf(str,"FENE bond too long: " BIGINT_FORMAT " " sprintf(str,"FENE bond too long: " BIGINT_FORMAT " "
@ -186,18 +186,18 @@ void BondFENEIntel::eval(const int vflag,
if (rlogarg <= (flt_t)-3.0) error->one(FLERR,"Bad FENE bond"); if (rlogarg <= (flt_t)-3.0) error->one(FLERR,"Bad FENE bond");
rlogarg = (flt_t)0.1; rlogarg = (flt_t)0.1;
} }
flt_t fbond = -k/rlogarg; flt_t fbond = -k/rlogarg;
// force from LJ term // force from LJ term
flt_t sr2,sr6; flt_t sr2,sr6;
if (rsq < (flt_t)TWO_1_3*sigmasq) { if (rsq < (flt_t)TWO_1_3*sigmasq) {
sr2 = sigmasq * irsq; sr2 = sigmasq * irsq;
sr6 = sr2 * sr2 * sr2; sr6 = sr2 * sr2 * sr2;
fbond += (flt_t)48.0 * epsilon * sr6 * (sr6 - (flt_t)0.5) * irsq; fbond += (flt_t)48.0 * epsilon * sr6 * (sr6 - (flt_t)0.5) * irsq;
} }
// energy // energy
flt_t ebond; flt_t ebond;
@ -215,27 +215,27 @@ void BondFENEIntel::eval(const int vflag,
{ {
if (NEWTON_BOND || i1 < nlocal) { if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += delx*fbond; f[i1].x += delx*fbond;
f[i1].y += dely*fbond; f[i1].y += dely*fbond;
f[i1].z += delz*fbond; f[i1].z += delz*fbond;
} }
if (NEWTON_BOND || i2 < nlocal) { if (NEWTON_BOND || i2 < nlocal) {
f[i2].x -= delx*fbond; f[i2].x -= delx*fbond;
f[i2].y -= dely*fbond; f[i2].y -= dely*fbond;
f[i2].z -= delz*fbond; f[i2].z -= delz*fbond;
} }
} }
if (EFLAG || VFLAG) { if (EFLAG || VFLAG) {
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
delx, dely, delz, sebond, f, NEWTON_BOND, delx, dely, delz, sebond, f, NEWTON_BOND,
nlocal, sv0, sv1, sv2, sv3, sv4, sv5); nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
#else #else
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
delx, dely, delz, oebond, f, NEWTON_BOND, delx, dely, delz, oebond, f, NEWTON_BOND,
nlocal, ov0, ov1, ov2, ov3, ov4, ov5); nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
#endif #endif
} }
} // for n } // for n
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
@ -250,7 +250,7 @@ void BondFENEIntel::eval(const int vflag,
if (EFLAG) energy += oebond; if (EFLAG) energy += oebond;
if (VFLAG && vflag) { if (VFLAG && vflag) {
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
} }
fix->set_reduce_flag(); fix->set_reduce_flag();
@ -307,11 +307,11 @@ void BondFENEIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void BondFENEIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes, void BondFENEIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
Memory *memory) { Memory *memory) {
if (nbondtypes != _nbondtypes) { if (nbondtypes != _nbondtypes) {
if (_nbondtypes > 0) if (_nbondtypes > 0)
_memory->destroy(fc); _memory->destroy(fc);
if (nbondtypes > 0) if (nbondtypes > 0)
_memory->create(fc,nbondtypes,"bondfeneintel.fc"); _memory->create(fc,nbondtypes,"bondfeneintel.fc");
} }

View File

@ -45,8 +45,8 @@ class BondFENEIntel : public BondFENE {
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers); IntelBuffers<flt_t, acc_t> *buffers);

View File

@ -33,7 +33,7 @@ typedef struct { int a,b,t; } int3_t;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp) BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp)
{ {
suffix_flag |= Suffix::INTEL; suffix_flag |= Suffix::INTEL;
} }
@ -70,8 +70,8 @@ void BondHarmonicIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void BondHarmonicIntel::compute(int eflag, int vflag, void BondHarmonicIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) ev_setup(eflag,vflag); if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = 0; else evflag = 0;
@ -79,14 +79,14 @@ void BondHarmonicIntel::compute(int eflag, int vflag,
if (evflag) { if (evflag) {
if (vflag && !eflag) { if (vflag && !eflag) {
if (force->newton_bond) if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc); eval<0,1,1>(vflag, buffers, fc);
else else
eval<0,1,0>(vflag, buffers, fc); eval<0,1,0>(vflag, buffers, fc);
} else { } else {
if (force->newton_bond) if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc); eval<1,1,1>(vflag, buffers, fc);
else else
eval<1,1,0>(vflag, buffers, fc); eval<1,1,0>(vflag, buffers, fc);
} }
} else { } else {
if (force->newton_bond) if (force->newton_bond)
@ -97,9 +97,9 @@ void BondHarmonicIntel::compute(int eflag, int vflag,
} }
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void BondHarmonicIntel::eval(const int vflag, void BondHarmonicIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
const int inum = neighbor->nbondlist; const int inum = neighbor->nbondlist;
if (inum == 0) return; if (inum == 0) return;
@ -126,7 +126,7 @@ void BondHarmonicIntel::eval(const int vflag,
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp parallel default(none) \ #pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \ shared(f_start,f_stride,fc) \
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5) reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
#endif #endif
{ {
@ -141,7 +141,7 @@ void BondHarmonicIntel::eval(const int vflag,
if (fix->need_zero(tid)) if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T)); memset(f, 0, f_stride * sizeof(FORCE_T));
const int3_t * _noalias const bondlist = const int3_t * _noalias const bondlist =
(int3_t *) neighbor->bondlist[0]; (int3_t *) neighbor->bondlist[0];
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
@ -184,29 +184,29 @@ void BondHarmonicIntel::eval(const int vflag,
{ {
if (NEWTON_BOND || i1 < nlocal) { if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += delx*fbond; f[i1].x += delx*fbond;
f[i1].y += dely*fbond; f[i1].y += dely*fbond;
f[i1].z += delz*fbond; f[i1].z += delz*fbond;
} }
if (NEWTON_BOND || i2 < nlocal) { if (NEWTON_BOND || i2 < nlocal) {
f[i2].x -= delx*fbond; f[i2].x -= delx*fbond;
f[i2].y -= dely*fbond; f[i2].y -= dely*fbond;
f[i2].z -= delz*fbond; f[i2].z -= delz*fbond;
} }
} }
if (EFLAG || VFLAG) { if (EFLAG || VFLAG) {
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
fbond, delx, dely, delz, sebond, f, fbond, delx, dely, delz, sebond, f,
NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
sv4, sv5); sv4, sv5);
#else #else
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
fbond, delx, dely, delz, oebond, f, fbond, delx, dely, delz, oebond, f,
NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
ov4, ov5); ov4, ov5);
#endif #endif
} }
} // for n } // for n
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
@ -221,7 +221,7 @@ void BondHarmonicIntel::eval(const int vflag,
if (EFLAG) energy += oebond; if (EFLAG) energy += oebond;
if (VFLAG && vflag) { if (VFLAG && vflag) {
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
} }
fix->set_reduce_flag(); fix->set_reduce_flag();
@ -276,11 +276,11 @@ void BondHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void BondHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes, void BondHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
Memory *memory) { Memory *memory) {
if (nbondtypes != _nbondtypes) { if (nbondtypes != _nbondtypes) {
if (_nbondtypes > 0) if (_nbondtypes > 0)
_memory->destroy(fc); _memory->destroy(fc);
if (nbondtypes > 0) if (nbondtypes > 0)
_memory->create(fc,nbondtypes,"bondharmonicintel.fc"); _memory->create(fc,nbondtypes,"bondharmonicintel.fc");
} }

View File

@ -45,8 +45,8 @@ class BondHarmonicIntel : public BondHarmonic {
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers); IntelBuffers<flt_t, acc_t> *buffers);

View File

@ -80,8 +80,8 @@ void DihedralCharmmIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void DihedralCharmmIntel::compute(int eflag, int vflag, void DihedralCharmmIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) { if (eflag || vflag) {
ev_setup(eflag,vflag); ev_setup(eflag,vflag);
@ -95,14 +95,14 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
if (evflag) { if (evflag) {
if (vflag && !eflag) { if (vflag && !eflag) {
if (force->newton_bond) if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc); eval<0,1,1>(vflag, buffers, fc);
else else
eval<0,1,0>(vflag, buffers, fc); eval<0,1,0>(vflag, buffers, fc);
} else { } else {
if (force->newton_bond) if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc); eval<1,1,1>(vflag, buffers, fc);
else else
eval<1,1,0>(vflag, buffers, fc); eval<1,1,0>(vflag, buffers, fc);
} }
} else { } else {
if (force->newton_bond) if (force->newton_bond)
@ -115,9 +115,9 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
#ifndef LMP_USE_AVXCD_DHC #ifndef LMP_USE_AVXCD_DHC
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void DihedralCharmmIntel::eval(const int vflag, void DihedralCharmmIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
const int inum = neighbor->ndihedrallist; const int inum = neighbor->ndihedrallist;
@ -148,9 +148,9 @@ void DihedralCharmmIntel::eval(const int vflag,
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp parallel default(none) \ #pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \ shared(f_start,f_stride,fc) \
reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \ reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
opv0,opv1,opv2,opv3,opv4,opv5) opv0,opv1,opv2,opv3,opv4,opv5)
#endif #endif
{ {
#if defined(LMP_SIMD_COMPILER_TEST) #if defined(LMP_SIMD_COMPILER_TEST)
@ -165,7 +165,7 @@ void DihedralCharmmIntel::eval(const int vflag,
if (fix->need_zero(tid)) if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T)); memset(f, 0, f_stride * sizeof(FORCE_T));
const int5_t * _noalias const dihedrallist = const int5_t * _noalias const dihedrallist =
(int5_t *) neighbor->dihedrallist[0]; (int5_t *) neighbor->dihedrallist[0];
const flt_t qqrd2e = force->qqrd2e; const flt_t qqrd2e = force->qqrd2e;
@ -180,7 +180,7 @@ void DihedralCharmmIntel::eval(const int vflag,
#if defined(LMP_SIMD_COMPILER_TEST) #if defined(LMP_SIMD_COMPILER_TEST)
#pragma vector aligned #pragma vector aligned
#pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \ #pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5) sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
for (int n = nfrom; n < nto; n++) { for (int n = nfrom; n < nto; n++) {
#endif #endif
for (int n = nfrom; n < nto; n += npl) { for (int n = nfrom; n < nto; n += npl) {
@ -204,7 +204,7 @@ void DihedralCharmmIntel::eval(const int vflag,
const flt_t vb2zm = x[i2].z - x[i3].z; const flt_t vb2zm = x[i2].z - x[i3].z;
// 3rd bond // 3rd bond
const flt_t vb3x = x[i4].x - x[i3].x; const flt_t vb3x = x[i4].x - x[i3].x;
const flt_t vb3y = x[i4].y - x[i3].y; const flt_t vb3y = x[i4].y - x[i3].y;
const flt_t vb3z = x[i4].z - x[i3].z; const flt_t vb3z = x[i4].z - x[i3].z;
@ -244,25 +244,25 @@ void DihedralCharmmIntel::eval(const int vflag,
// error check // error check
#ifndef LMP_SIMD_COMPILER_TEST #ifndef LMP_SIMD_COMPILER_TEST
if (c > PTOLERANCE || c < MTOLERANCE) { if (c > PTOLERANCE || c < MTOLERANCE) {
int me = comm->me; int me = comm->me;
if (screen) { if (screen) {
char str[128]; char str[128];
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT, TAGINT_FORMAT " " TAGINT_FORMAT,
me,tid,update->ntimestep, me,tid,update->ntimestep,
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
error->warning(FLERR,str,0); error->warning(FLERR,str,0);
fprintf(screen," 1st atom: %d %g %g %g\n", fprintf(screen," 1st atom: %d %g %g %g\n",
me,x[i1].x,x[i1].y,x[i1].z); me,x[i1].x,x[i1].y,x[i1].z);
fprintf(screen," 2nd atom: %d %g %g %g\n", fprintf(screen," 2nd atom: %d %g %g %g\n",
me,x[i2].x,x[i2].y,x[i2].z); me,x[i2].x,x[i2].y,x[i2].z);
fprintf(screen," 3rd atom: %d %g %g %g\n", fprintf(screen," 3rd atom: %d %g %g %g\n",
me,x[i3].x,x[i3].y,x[i3].z); me,x[i3].x,x[i3].y,x[i3].z);
fprintf(screen," 4th atom: %d %g %g %g\n", fprintf(screen," 4th atom: %d %g %g %g\n",
me,x[i4].x,x[i4].y,x[i4].z); me,x[i4].x,x[i4].y,x[i4].z);
} }
} }
#endif #endif
@ -279,19 +279,19 @@ void DihedralCharmmIntel::eval(const int vflag,
ddf1 = df1 = (flt_t)0.0; ddf1 = df1 = (flt_t)0.0;
for (int i = 0; i < m; i++) { for (int i = 0; i < m; i++) {
ddf1 = p*c - df1*s; ddf1 = p*c - df1*s;
df1 = p*s + df1*c; df1 = p*s + df1*c;
p = ddf1; p = ddf1;
} }
p = p*tcos_shift + df1*tsin_shift; p = p*tcos_shift + df1*tsin_shift;
df1 = df1*tcos_shift - ddf1*tsin_shift; df1 = df1*tcos_shift - ddf1*tsin_shift;
df1 *= -m; df1 *= -m;
p += (flt_t)1.0; p += (flt_t)1.0;
if (m == 0) { if (m == 0) {
p = (flt_t)1.0 + tcos_shift; p = (flt_t)1.0 + tcos_shift;
df1 = (flt_t)0.0; df1 = (flt_t)0.0;
} }
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm; const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
@ -334,12 +334,12 @@ void DihedralCharmmIntel::eval(const int vflag,
const flt_t f3z = -sz2 - f4z; const flt_t f3z = -sz2 - f4z;
if (EFLAG || VFLAG) { if (EFLAG || VFLAG) {
flt_t deng; flt_t deng;
if (EFLAG) deng = tk * p; if (EFLAG) deng = tk * p;
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3,
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND, vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND,
nlocal, sv0, sv1, sv2, sv3, sv4, sv5); nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
} }
@ -349,15 +349,15 @@ void DihedralCharmmIntel::eval(const int vflag,
#endif #endif
{ {
if (NEWTON_BOND || i2 < nlocal) { if (NEWTON_BOND || i2 < nlocal) {
f[i2].x += f2x; f[i2].x += f2x;
f[i2].y += f2y; f[i2].y += f2y;
f[i2].z += f2z; f[i2].z += f2z;
} }
if (NEWTON_BOND || i3 < nlocal) { if (NEWTON_BOND || i3 < nlocal) {
f[i3].x += f3x; f[i3].x += f3x;
f[i3].y += f3y; f[i3].y += f3y;
f[i3].z += f3z; f[i3].z += f3z;
} }
} }
@ -372,54 +372,54 @@ void DihedralCharmmIntel::eval(const int vflag,
flt_t forcecoul; flt_t forcecoul;
if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv; if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv;
else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv); else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv);
const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv - const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv -
fc.ljp[itype][jtype].lj2); fc.ljp[itype][jtype].lj2);
const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv; const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv;
if (NEWTON_BOND || i1 < nlocal) { if (NEWTON_BOND || i1 < nlocal) {
f1x += delx*fpair; f1x += delx*fpair;
f1y += dely*fpair; f1y += dely*fpair;
f1z += delz*fpair; f1z += delz*fpair;
} }
if (NEWTON_BOND || i4 < nlocal) { if (NEWTON_BOND || i4 < nlocal) {
f4x -= delx*fpair; f4x -= delx*fpair;
f4y -= dely*fpair; f4y -= dely*fpair;
f4z -= delz*fpair; f4z -= delz*fpair;
} }
if (EFLAG || VFLAG) { if (EFLAG || VFLAG) {
flt_t ev_pre = (flt_t)0; flt_t ev_pre = (flt_t)0;
if (NEWTON_BOND || i1 < nlocal) if (NEWTON_BOND || i1 < nlocal)
ev_pre += (flt_t)0.5; ev_pre += (flt_t)0.5;
if (NEWTON_BOND || i4 < nlocal) if (NEWTON_BOND || i4 < nlocal)
ev_pre += (flt_t)0.5; ev_pre += (flt_t)0.5;
if (EFLAG) { if (EFLAG) {
flt_t ecoul, evdwl; flt_t ecoul, evdwl;
ecoul = tweight * forcecoul; ecoul = tweight * forcecoul;
evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv - evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv -
fc.ljp[itype][jtype].lj4); fc.ljp[itype][jtype].lj4);
secoul += ev_pre * ecoul; secoul += ev_pre * ecoul;
sevdwl += ev_pre * evdwl; sevdwl += ev_pre * evdwl;
if (eatom) { if (eatom) {
evdwl *= (flt_t)0.5; evdwl *= (flt_t)0.5;
evdwl += (flt_t)0.5 * ecoul; evdwl += (flt_t)0.5 * ecoul;
if (NEWTON_BOND || i1 < nlocal) if (NEWTON_BOND || i1 < nlocal)
f[i1].w += evdwl; f[i1].w += evdwl;
if (NEWTON_BOND || i4 < nlocal) if (NEWTON_BOND || i4 < nlocal)
f[i4].w += evdwl; f[i4].w += evdwl;
} }
} }
// IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, // IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
// delx, dely, delz); // delx, dely, delz);
if (VFLAG && vflag) { if (VFLAG && vflag) {
spv0 += ev_pre * delx * delx * fpair; spv0 += ev_pre * delx * delx * fpair;
spv1 += ev_pre * dely * dely * fpair; spv1 += ev_pre * dely * dely * fpair;
spv2 += ev_pre * delz * delz * fpair; spv2 += ev_pre * delz * delz * fpair;
spv3 += ev_pre * delx * dely * fpair; spv3 += ev_pre * delx * dely * fpair;
spv4 += ev_pre * delx * delz * fpair; spv4 += ev_pre * delx * delz * fpair;
spv5 += ev_pre * dely * delz * fpair; spv5 += ev_pre * dely * delz * fpair;
} }
} }
// apply force to each of 4 atoms // apply force to each of 4 atoms
@ -428,15 +428,15 @@ void DihedralCharmmIntel::eval(const int vflag,
#endif #endif
{ {
if (NEWTON_BOND || i1 < nlocal) { if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += f1x; f[i1].x += f1x;
f[i1].y += f1y; f[i1].y += f1y;
f[i1].z += f1z; f[i1].z += f1z;
} }
if (NEWTON_BOND || i4 < nlocal) { if (NEWTON_BOND || i4 < nlocal) {
f[i4].x += f4x; f[i4].x += f4x;
f[i4].y += f4y; f[i4].y += f4y;
f[i4].z += f4z; f[i4].z += f4z;
} }
} }
} // for n } // for n
@ -447,7 +447,7 @@ void DihedralCharmmIntel::eval(const int vflag,
} }
if (VFLAG && vflag) { if (VFLAG && vflag) {
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
opv0 += spv0; opv1 += spv1; opv2 += spv2; opv0 += spv0; opv1 += spv1; opv2 += spv2;
opv3 += spv3; opv4 += spv4; opv5 += spv5; opv3 += spv3; opv4 += spv4; opv5 += spv5;
} }
} // omp parallel } // omp parallel
@ -485,9 +485,9 @@ authors for more details.
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void DihedralCharmmIntel::eval(const int vflag, void DihedralCharmmIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t; typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
@ -522,20 +522,20 @@ void DihedralCharmmIntel::eval(const int vflag,
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp parallel default(none) \ #pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \ shared(f_start,f_stride,fc) \
reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \ reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
opv0,opv1,opv2,opv3,opv4,opv5) opv0,opv1,opv2,opv3,opv4,opv5)
#endif #endif
{ {
int nfrom, npl, nto, tid; int nfrom, npl, nto, tid;
IP_PRE_omp_stride_id_vec(nfrom, npl, nto, tid, inum, nthreads, IP_PRE_omp_stride_id_vec(nfrom, npl, nto, tid, inum, nthreads,
swidth); swidth);
FORCE_T * _noalias const f = f_start + (tid * f_stride); FORCE_T * _noalias const f = f_start + (tid * f_stride);
if (fix->need_zero(tid)) if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T)); memset(f, 0, f_stride * sizeof(FORCE_T));
const int * _noalias const dihedrallist = const int * _noalias const dihedrallist =
(int *) neighbor->dihedrallist[0]; (int *) neighbor->dihedrallist[0];
const flt_t * _noalias const weight = &(fc.weight[0]); const flt_t * _noalias const weight = &(fc.weight[0]);
const flt_t * _noalias const x_f = &(x[0].x); const flt_t * _noalias const x_f = &(x[0].x);
@ -574,7 +574,7 @@ void DihedralCharmmIntel::eval(const int vflag,
} }
SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
55, 60, 65, 70, 75) + (nfrom * 5); 55, 60, 65, 70, 75) + (nfrom * 5);
const int nto5 = nto * 5; const int nto5 = nto * 5;
const int nlocals4 = nlocal << 4; const int nlocals4 = nlocal << 4;
const SIMD_int simd_nlocals4 = SIMD_set(nlocals4); const SIMD_int simd_nlocals4 = SIMD_set(nlocals4);
@ -618,7 +618,7 @@ void DihedralCharmmIntel::eval(const int vflag,
const SIMD_flt_t vb2zm = z2 - z3; const SIMD_flt_t vb2zm = z2 - z3;
// 3rd bond // 3rd bond
SIMD_flt_t x4, y4, z4; SIMD_flt_t x4, y4, z4;
SIMD_int jtype; SIMD_int jtype;
@ -664,7 +664,7 @@ void DihedralCharmmIntel::eval(const int vflag,
const SIMD_flt_t ptol = SIMD_set(PTOLERANCE); const SIMD_flt_t ptol = SIMD_set(PTOLERANCE);
const SIMD_flt_t ntol = SIMD_set(MTOLERANCE); const SIMD_flt_t ntol = SIMD_set(MTOLERANCE);
if (c > ptol || c < ntol) if (c > ptol || c < ntol)
if (screen) if (screen)
error->warning(FLERR,"Dihedral problem."); error->warning(FLERR,"Dihedral problem.");
c = SIMD_set(c, c > one, one); c = SIMD_set(c, c > one, one);
@ -678,14 +678,14 @@ void DihedralCharmmIntel::eval(const int vflag,
SIMD_flt_t p(one); SIMD_flt_t p(one);
SIMD_flt_t ddf1(szero); SIMD_flt_t ddf1(szero);
SIMD_flt_t df1(szero); SIMD_flt_t df1(szero);
const int m_max = SIMD_max(m); const int m_max = SIMD_max(m);
for (int i = 0; i < m_max; i++) { for (int i = 0; i < m_max; i++) {
const SIMD_mask my_m = i < m; const SIMD_mask my_m = i < m;
ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s); ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s);
df1 = SIMD_set(df1, my_m, p*s + df1*c); df1 = SIMD_set(df1, my_m, p*s + df1*c);
p = SIMD_set(p, my_m, ddf1); p = SIMD_set(p, my_m, ddf1);
} }
SIMD_flt_t multf; SIMD_flt_t multf;
@ -694,7 +694,7 @@ void DihedralCharmmIntel::eval(const int vflag,
df1 = df1*tcos_shift - ddf1*tsin_shift; df1 = df1*tcos_shift - ddf1*tsin_shift;
df1 = df1 * multf; df1 = df1 * multf;
p = p + one; p = p + one;
SIMD_mask mzero = (m == SIMD_set((int)0)); SIMD_mask mzero = (m == SIMD_set((int)0));
p = SIMD_set(p, mzero, one + tcos_shift); p = SIMD_set(p, mzero, one + tcos_shift);
df1 = SIMD_set(df1, mzero, szero); df1 = SIMD_set(df1, mzero, szero);
@ -740,40 +740,40 @@ void DihedralCharmmIntel::eval(const int vflag,
SIMD_flt_t qdeng; SIMD_flt_t qdeng;
if (EFLAG || VFLAG) { if (EFLAG || VFLAG) {
SIMD_flt_t ev_pre; SIMD_flt_t ev_pre;
if (NEWTON_BOND) ev_pre = one; if (NEWTON_BOND) ev_pre = one;
else { else {
ev_pre = szero; ev_pre = szero;
const SIMD_flt_t quarter = SIMD_set((flt_t)0.25); const SIMD_flt_t quarter = SIMD_set((flt_t)0.25);
ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter); ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter);
ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter); ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter);
ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter); ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter);
ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter); ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter);
} }
SIMD_zero_masked(nmask, ev_pre); SIMD_zero_masked(nmask, ev_pre);
if (EFLAG) { if (EFLAG) {
const SIMD_flt_t deng = tk * p; const SIMD_flt_t deng = tk * p;
sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng); sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng);
if (eatom) { if (eatom) {
qdeng = deng * SIMD_set((flt_t)0.25); qdeng = deng * SIMD_set((flt_t)0.25);
SIMD_mask newton_mask; SIMD_mask newton_mask;
if (NEWTON_BOND) newton_mask = nmask; if (NEWTON_BOND) newton_mask = nmask;
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4); if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
SIMD_flt_t ieng = qdeng; SIMD_flt_t ieng = qdeng;
SIMD_jeng_update(newton_mask, featom, i2, ieng); SIMD_jeng_update(newton_mask, featom, i2, ieng);
ieng = qdeng; ieng = qdeng;
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4); if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
SIMD_jeng_update(newton_mask, featom, i3, ieng); SIMD_jeng_update(newton_mask, featom, i3, ieng);
} }
} }
if (VFLAG && vflag) { if (VFLAG && vflag) {
sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x)); sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x));
sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y)); sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y));
sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z)); sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z));
sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y)); sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y));
sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z)); sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z));
sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z)); sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z));
} }
} }
SIMD_mask newton_mask; SIMD_mask newton_mask;
@ -809,27 +809,27 @@ void DihedralCharmmIntel::eval(const int vflag,
f4z = f4z - delz * fpair; f4z = f4z - delz * fpair;
if (EFLAG || VFLAG) { if (EFLAG || VFLAG) {
SIMD_flt_t ev_pre; SIMD_flt_t ev_pre;
if (NEWTON_BOND) ev_pre = one; if (NEWTON_BOND) ev_pre = one;
else { else {
ev_pre = szero; ev_pre = szero;
const SIMD_flt_t half = SIMD_set((flt_t)0.5); const SIMD_flt_t half = SIMD_set((flt_t)0.5);
ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4,ev_pre,half); ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4,ev_pre,half);
ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4,ev_pre,half); ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4,ev_pre,half);
} }
SIMD_zero_masked(nmask, ev_pre); SIMD_zero_masked(nmask, ev_pre);
if (EFLAG) { if (EFLAG) {
const SIMD_flt_t ecoul = tweight * forcecoul; const SIMD_flt_t ecoul = tweight * forcecoul;
const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype); const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype);
const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype); const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype);
SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4); SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4);
secoul = SIMD_ev_add(secoul, ev_pre * ecoul); secoul = SIMD_ev_add(secoul, ev_pre * ecoul);
sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl); sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl);
if (eatom) { if (eatom) {
const SIMD_flt_t half = SIMD_set((flt_t)0.5); const SIMD_flt_t half = SIMD_set((flt_t)0.5);
evdwl = evdwl * half; evdwl = evdwl * half;
evdwl = evdwl + half * ecoul + qdeng; evdwl = evdwl + half * ecoul + qdeng;
if (NEWTON_BOND) newton_mask = nmask; if (NEWTON_BOND) newton_mask = nmask;
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4); if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4);
@ -838,16 +838,16 @@ void DihedralCharmmIntel::eval(const int vflag,
ieng = evdwl; ieng = evdwl;
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4); if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4);
SIMD_jeng_update(newton_mask, featom, i4, ieng); SIMD_jeng_update(newton_mask, featom, i4, ieng);
} }
} }
if (VFLAG && vflag) { if (VFLAG && vflag) {
spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair); spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair);
spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair); spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair);
spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair); spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair);
spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair); spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair);
spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair); spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair);
spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair); spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair);
} }
} }
if (NEWTON_BOND) newton_mask = nmask; if (NEWTON_BOND) newton_mask = nmask;
@ -863,17 +863,17 @@ void DihedralCharmmIntel::eval(const int vflag,
oevdwl += SIMD_sum(sevdwl); oevdwl += SIMD_sum(sevdwl);
} }
if (VFLAG && vflag) { if (VFLAG && vflag) {
ov0 += SIMD_sum(sv0); ov0 += SIMD_sum(sv0);
ov1 += SIMD_sum(sv1); ov1 += SIMD_sum(sv1);
ov2 += SIMD_sum(sv2); ov2 += SIMD_sum(sv2);
ov3 += SIMD_sum(sv3); ov3 += SIMD_sum(sv3);
ov4 += SIMD_sum(sv4); ov4 += SIMD_sum(sv4);
ov5 += SIMD_sum(sv5); ov5 += SIMD_sum(sv5);
opv0 += SIMD_sum(spv0); opv0 += SIMD_sum(spv0);
opv1 += SIMD_sum(spv1); opv1 += SIMD_sum(spv1);
opv2 += SIMD_sum(spv2); opv2 += SIMD_sum(spv2);
opv3 += SIMD_sum(spv3); opv3 += SIMD_sum(spv3);
opv4 += SIMD_sum(spv4); opv4 += SIMD_sum(spv4);
opv5 += SIMD_sum(spv5); opv5 += SIMD_sum(spv5);
} }
} // omp parallel } // omp parallel
@ -933,7 +933,7 @@ void DihedralCharmmIntel::init_style()
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc, void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers) IntelBuffers<flt_t,acc_t> *buffers)
{ {
const int tp1 = atom->ntypes + 1; const int tp1 = atom->ntypes + 1;
@ -944,10 +944,10 @@ void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
if (weightflag) { if (weightflag) {
for (int i = 0; i < tp1; i++) { for (int i = 0; i < tp1; i++) {
for (int j = 0; j < tp1; j++) { for (int j = 0; j < tp1; j++) {
fc.ljp[i][j].lj1 = lj14_1[i][j]; fc.ljp[i][j].lj1 = lj14_1[i][j];
fc.ljp[i][j].lj2 = lj14_2[i][j]; fc.ljp[i][j].lj2 = lj14_2[i][j];
fc.ljp[i][j].lj3 = lj14_3[i][j]; fc.ljp[i][j].lj3 = lj14_3[i][j];
fc.ljp[i][j].lj4 = lj14_4[i][j]; fc.ljp[i][j].lj4 = lj14_4[i][j];
} }
} }
} }
@ -965,8 +965,8 @@ void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes, void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes,
const int nbondtypes, const int nbondtypes,
Memory *memory) { Memory *memory) {
if (npairtypes != _npairtypes) { if (npairtypes != _npairtypes) {
if (_npairtypes > 0) if (_npairtypes > 0)
_memory->destroy(ljp); _memory->destroy(ljp);
@ -979,7 +979,7 @@ void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes,
_memory->destroy(bp); _memory->destroy(bp);
_memory->destroy(weight); _memory->destroy(weight);
} }
if (nbondtypes > 0) { if (nbondtypes > 0) {
_memory->create(bp,nbondtypes,"dihedralcharmmintel.bp"); _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
_memory->create(weight,nbondtypes,"dihedralcharmmintel.weight"); _memory->create(weight,nbondtypes,"dihedralcharmmintel.weight");

View File

@ -44,8 +44,8 @@ class DihedralCharmmIntel : public DihedralCharmm {
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers); IntelBuffers<flt_t, acc_t> *buffers);
@ -58,7 +58,7 @@ class DihedralCharmmIntel : public DihedralCharmm {
class ForceConst { class ForceConst {
public: public:
typedef struct { flt_t lj1, lj2, lj3, lj4; } fc_packed1; typedef struct { flt_t lj1, lj2, lj3, lj4; } fc_packed1;
typedef struct { flt_t cos_shift, sin_shift, k; typedef struct { flt_t cos_shift, sin_shift, k;
int multiplicity; } fc_packed3; int multiplicity; } fc_packed3;
fc_packed1 **ljp; fc_packed1 **ljp;

View File

@ -69,8 +69,8 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void DihedralHarmonicIntel::compute(int eflag, int vflag, void DihedralHarmonicIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) { if (eflag || vflag) {
ev_setup(eflag,vflag); ev_setup(eflag,vflag);
@ -79,14 +79,14 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag,
if (evflag) { if (evflag) {
if (vflag && !eflag) { if (vflag && !eflag) {
if (force->newton_bond) if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc); eval<0,1,1>(vflag, buffers, fc);
else else
eval<0,1,0>(vflag, buffers, fc); eval<0,1,0>(vflag, buffers, fc);
} else { } else {
if (force->newton_bond) if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc); eval<1,1,1>(vflag, buffers, fc);
else else
eval<1,1,0>(vflag, buffers, fc); eval<1,1,0>(vflag, buffers, fc);
} }
} else { } else {
if (force->newton_bond) if (force->newton_bond)
@ -97,9 +97,9 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag,
} }
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void DihedralHarmonicIntel::eval(const int vflag, void DihedralHarmonicIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
const int inum = neighbor->ndihedrallist; const int inum = neighbor->ndihedrallist;
@ -127,7 +127,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp parallel default(none) \ #pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \ shared(f_start,f_stride,fc) \
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
#endif #endif
{ {
@ -142,7 +142,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
if (fix->need_zero(tid)) if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T)); memset(f, 0, f_stride * sizeof(FORCE_T));
const int5_t * _noalias const dihedrallist = const int5_t * _noalias const dihedrallist =
(int5_t *) neighbor->dihedrallist[0]; (int5_t *) neighbor->dihedrallist[0];
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
@ -175,7 +175,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
const flt_t vb2zm = x[i2].z - x[i3].z; const flt_t vb2zm = x[i2].z - x[i3].z;
// 3rd bond // 3rd bond
const flt_t vb3x = x[i4].x - x[i3].x; const flt_t vb3x = x[i4].x - x[i3].x;
const flt_t vb3y = x[i4].y - x[i3].y; const flt_t vb3y = x[i4].y - x[i3].y;
const flt_t vb3z = x[i4].z - x[i3].z; const flt_t vb3z = x[i4].z - x[i3].z;
@ -207,25 +207,25 @@ void DihedralHarmonicIntel::eval(const int vflag,
// error check // error check
#ifndef LMP_INTEL_USE_SIMDOFF #ifndef LMP_INTEL_USE_SIMDOFF
if (c > PTOLERANCE || c < MTOLERANCE) { if (c > PTOLERANCE || c < MTOLERANCE) {
int me = comm->me; int me = comm->me;
if (screen) { if (screen) {
char str[128]; char str[128];
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT, TAGINT_FORMAT " " TAGINT_FORMAT,
me,tid,update->ntimestep, me,tid,update->ntimestep,
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
error->warning(FLERR,str,0); error->warning(FLERR,str,0);
fprintf(screen," 1st atom: %d %g %g %g\n", fprintf(screen," 1st atom: %d %g %g %g\n",
me,x[i1].x,x[i1].y,x[i1].z); me,x[i1].x,x[i1].y,x[i1].z);
fprintf(screen," 2nd atom: %d %g %g %g\n", fprintf(screen," 2nd atom: %d %g %g %g\n",
me,x[i2].x,x[i2].y,x[i2].z); me,x[i2].x,x[i2].y,x[i2].z);
fprintf(screen," 3rd atom: %d %g %g %g\n", fprintf(screen," 3rd atom: %d %g %g %g\n",
me,x[i3].x,x[i3].y,x[i3].z); me,x[i3].x,x[i3].y,x[i3].z);
fprintf(screen," 4th atom: %d %g %g %g\n", fprintf(screen," 4th atom: %d %g %g %g\n",
me,x[i4].x,x[i4].y,x[i4].z); me,x[i4].x,x[i4].y,x[i4].z);
} }
} }
#endif #endif
@ -242,19 +242,19 @@ void DihedralHarmonicIntel::eval(const int vflag,
ddf1 = df1 = (flt_t)0.0; ddf1 = df1 = (flt_t)0.0;
for (int i = 0; i < m; i++) { for (int i = 0; i < m; i++) {
ddf1 = p*c - df1*s; ddf1 = p*c - df1*s;
df1 = p*s + df1*c; df1 = p*s + df1*c;
p = ddf1; p = ddf1;
} }
p = p*tcos_shift + df1*tsin_shift; p = p*tcos_shift + df1*tsin_shift;
df1 = df1*tcos_shift - ddf1*tsin_shift; df1 = df1*tcos_shift - ddf1*tsin_shift;
df1 *= -m; df1 *= -m;
p += (flt_t)1.0; p += (flt_t)1.0;
if (m == 0) { if (m == 0) {
p = (flt_t)1.0 + tcos_shift; p = (flt_t)1.0 + tcos_shift;
df1 = (flt_t)0.0; df1 = (flt_t)0.0;
} }
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm; const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
@ -297,20 +297,20 @@ void DihedralHarmonicIntel::eval(const int vflag,
const flt_t f3z = -sz2 - f4z; const flt_t f3z = -sz2 - f4z;
if (EFLAG || VFLAG) { if (EFLAG || VFLAG) {
flt_t deng; flt_t deng;
if (EFLAG) deng = tk * p; if (EFLAG) deng = tk * p;
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
sv0, sv1, sv2, sv3, sv4, sv5); sv0, sv1, sv2, sv3, sv4, sv5);
#else #else
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
ov0, ov1, ov2, ov3, ov4, ov5); ov0, ov1, ov2, ov3, ov4, ov5);
#endif #endif
} }
@ -319,35 +319,35 @@ void DihedralHarmonicIntel::eval(const int vflag,
#endif #endif
{ {
if (NEWTON_BOND || i1 < nlocal) { if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += f1x; f[i1].x += f1x;
f[i1].y += f1y; f[i1].y += f1y;
f[i1].z += f1z; f[i1].z += f1z;
} }
if (NEWTON_BOND || i2 < nlocal) { if (NEWTON_BOND || i2 < nlocal) {
f[i2].x += f2x; f[i2].x += f2x;
f[i2].y += f2y; f[i2].y += f2y;
f[i2].z += f2z; f[i2].z += f2z;
} }
if (NEWTON_BOND || i3 < nlocal) { if (NEWTON_BOND || i3 < nlocal) {
f[i3].x += f3x; f[i3].x += f3x;
f[i3].y += f3y; f[i3].y += f3y;
f[i3].z += f3z; f[i3].z += f3z;
} }
if (NEWTON_BOND || i4 < nlocal) { if (NEWTON_BOND || i4 < nlocal) {
f[i4].x += f4x; f[i4].x += f4x;
f[i4].y += f4y; f[i4].y += f4y;
f[i4].z += f4z; f[i4].z += f4z;
} }
} }
} // for n } // for n
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
if (EFLAG) oedihedral += sedihedral; if (EFLAG) oedihedral += sedihedral;
if (VFLAG && vflag) { if (VFLAG && vflag) {
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov0 += sv0; ov1 += sv1; ov2 += sv2;
ov3 += sv3; ov4 += sv4; ov5 += sv5; ov3 += sv3; ov4 += sv4; ov5 += sv5;
} }
#endif #endif
} // omp parallel } // omp parallel
@ -395,7 +395,7 @@ void DihedralHarmonicIntel::init_style()
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void DihedralHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc, void DihedralHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers) IntelBuffers<flt_t,acc_t> *buffers)
{ {
const int bp1 = atom->ndihedraltypes + 1; const int bp1 = atom->ndihedraltypes + 1;
fc.set_ntypes(bp1,memory); fc.set_ntypes(bp1,memory);
@ -412,11 +412,11 @@ void DihedralHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void DihedralHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes, void DihedralHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
Memory *memory) { Memory *memory) {
if (nbondtypes != _nbondtypes) { if (nbondtypes != _nbondtypes) {
if (_nbondtypes > 0) if (_nbondtypes > 0)
_memory->destroy(bp); _memory->destroy(bp);
if (nbondtypes > 0) if (nbondtypes > 0)
_memory->create(bp,nbondtypes,"dihedralcharmmintel.bp"); _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
} }

View File

@ -44,8 +44,8 @@ class DihedralHarmonicIntel : public DihedralHarmonic {
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers); IntelBuffers<flt_t, acc_t> *buffers);
@ -57,7 +57,7 @@ class DihedralHarmonicIntel : public DihedralHarmonic {
template <class flt_t> template <class flt_t>
class ForceConst { class ForceConst {
public: public:
typedef struct { flt_t cos_shift, sin_shift, k; typedef struct { flt_t cos_shift, sin_shift, k;
int multiplicity; } fc_packed1; int multiplicity; } fc_packed1;
fc_packed1 *bp; fc_packed1 *bp;

View File

@ -73,8 +73,8 @@ void DihedralOPLSIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void DihedralOPLSIntel::compute(int eflag, int vflag, void DihedralOPLSIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) { if (eflag || vflag) {
ev_setup(eflag,vflag); ev_setup(eflag,vflag);
@ -83,14 +83,14 @@ void DihedralOPLSIntel::compute(int eflag, int vflag,
if (evflag) { if (evflag) {
if (vflag && !eflag) { if (vflag && !eflag) {
if (force->newton_bond) if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc); eval<0,1,1>(vflag, buffers, fc);
else else
eval<0,1,0>(vflag, buffers, fc); eval<0,1,0>(vflag, buffers, fc);
} else { } else {
if (force->newton_bond) if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc); eval<1,1,1>(vflag, buffers, fc);
else else
eval<1,1,0>(vflag, buffers, fc); eval<1,1,0>(vflag, buffers, fc);
} }
} else { } else {
if (force->newton_bond) if (force->newton_bond)
@ -101,9 +101,9 @@ void DihedralOPLSIntel::compute(int eflag, int vflag,
} }
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void DihedralOPLSIntel::eval(const int vflag, void DihedralOPLSIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
const int inum = neighbor->ndihedrallist; const int inum = neighbor->ndihedrallist;
@ -131,7 +131,7 @@ void DihedralOPLSIntel::eval(const int vflag,
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp parallel default(none) \ #pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \ shared(f_start,f_stride,fc) \
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
#endif #endif
{ {
@ -146,7 +146,7 @@ void DihedralOPLSIntel::eval(const int vflag,
if (fix->need_zero(tid)) if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T)); memset(f, 0, f_stride * sizeof(FORCE_T));
const int5_t * _noalias const dihedrallist = const int5_t * _noalias const dihedrallist =
(int5_t *) neighbor->dihedrallist[0]; (int5_t *) neighbor->dihedrallist[0];
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
@ -179,7 +179,7 @@ void DihedralOPLSIntel::eval(const int vflag,
const flt_t vb2zm = x[i2].z - x[i3].z; const flt_t vb2zm = x[i2].z - x[i3].z;
// 3rd bond // 3rd bond
const flt_t vb3x = x[i4].x - x[i3].x; const flt_t vb3x = x[i4].x - x[i3].x;
const flt_t vb3y = x[i4].y - x[i3].y; const flt_t vb3y = x[i4].y - x[i3].y;
const flt_t vb3z = x[i4].z - x[i3].z; const flt_t vb3z = x[i4].z - x[i3].z;
@ -209,7 +209,7 @@ void DihedralOPLSIntel::eval(const int vflag,
const flt_t c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3; const flt_t c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
flt_t ctmp = -vb1x*vb2xm - vb1y*vb2ym - vb1z*vb2zm; flt_t ctmp = -vb1x*vb2xm - vb1y*vb2ym - vb1z*vb2zm;
const flt_t r12c1 = rb1 * rb2; const flt_t r12c1 = rb1 * rb2;
const flt_t c1mag = ctmp * r12c1; const flt_t c1mag = ctmp * r12c1;
ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z; ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
@ -240,25 +240,25 @@ void DihedralOPLSIntel::eval(const int vflag,
// error check // error check
#ifndef LMP_INTEL_USE_SIMDOFF #ifndef LMP_INTEL_USE_SIMDOFF
if (c > PTOLERANCE || c < MTOLERANCE) { if (c > PTOLERANCE || c < MTOLERANCE) {
int me = comm->me; int me = comm->me;
if (screen) { if (screen) {
char str[128]; char str[128];
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT, TAGINT_FORMAT " " TAGINT_FORMAT,
me,tid,update->ntimestep, me,tid,update->ntimestep,
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
error->warning(FLERR,str,0); error->warning(FLERR,str,0);
fprintf(screen," 1st atom: %d %g %g %g\n", fprintf(screen," 1st atom: %d %g %g %g\n",
me,x[i1].x,x[i1].y,x[i1].z); me,x[i1].x,x[i1].y,x[i1].z);
fprintf(screen," 2nd atom: %d %g %g %g\n", fprintf(screen," 2nd atom: %d %g %g %g\n",
me,x[i2].x,x[i2].y,x[i2].z); me,x[i2].x,x[i2].y,x[i2].z);
fprintf(screen," 3rd atom: %d %g %g %g\n", fprintf(screen," 3rd atom: %d %g %g %g\n",
me,x[i3].x,x[i3].y,x[i3].z); me,x[i3].x,x[i3].y,x[i3].z);
fprintf(screen," 4th atom: %d %g %g %g\n", fprintf(screen," 4th atom: %d %g %g %g\n",
me,x[i4].x,x[i4].y,x[i4].z); me,x[i4].x,x[i4].y,x[i4].z);
} }
} }
#endif #endif
@ -283,14 +283,14 @@ void DihedralOPLSIntel::eval(const int vflag,
const flt_t sin_4phim = (flt_t)2.0 * cos_2phi * sin_2phim; const flt_t sin_4phim = (flt_t)2.0 * cos_2phi * sin_2phim;
flt_t p, pd; flt_t p, pd;
p = fc.bp[type].k1*((flt_t)1.0 + c) + p = fc.bp[type].k1*((flt_t)1.0 + c) +
fc.bp[type].k2*((flt_t)1.0 - cos_2phi) + fc.bp[type].k2*((flt_t)1.0 - cos_2phi) +
fc.bp[type].k3*((flt_t)1.0 + cos_3phi) + fc.bp[type].k3*((flt_t)1.0 + cos_3phi) +
fc.bp[type].k4*((flt_t)1.0 - cos_4phi) ; fc.bp[type].k4*((flt_t)1.0 - cos_4phi) ;
pd = fc.bp[type].k1 - pd = fc.bp[type].k1 -
(flt_t)2.0 * fc.bp[type].k2 * sin_2phim + (flt_t)2.0 * fc.bp[type].k2 * sin_2phim +
(flt_t)3.0 * fc.bp[type].k3 * sin_3phim - (flt_t)3.0 * fc.bp[type].k3 * sin_3phim -
(flt_t)4.0 * fc.bp[type].k4 * sin_4phim; (flt_t)4.0 * fc.bp[type].k4 * sin_4phim;
flt_t edihed; flt_t edihed;
if (EFLAG) edihed = p; if (EFLAG) edihed = p;
@ -327,18 +327,18 @@ void DihedralOPLSIntel::eval(const int vflag,
if (EFLAG || VFLAG) { if (EFLAG || VFLAG) {
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
sv0, sv1, sv2, sv3, sv4, sv5); sv0, sv1, sv2, sv3, sv4, sv5);
#else #else
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
ov0, ov1, ov2, ov3, ov4, ov5); ov0, ov1, ov2, ov3, ov4, ov5);
#endif #endif
} }
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
@ -346,35 +346,35 @@ void DihedralOPLSIntel::eval(const int vflag,
#endif #endif
{ {
if (NEWTON_BOND || i1 < nlocal) { if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += f1x; f[i1].x += f1x;
f[i1].y += f1y; f[i1].y += f1y;
f[i1].z += f1z; f[i1].z += f1z;
} }
if (NEWTON_BOND || i2 < nlocal) { if (NEWTON_BOND || i2 < nlocal) {
f[i2].x += f2x; f[i2].x += f2x;
f[i2].y += f2y; f[i2].y += f2y;
f[i2].z += f2z; f[i2].z += f2z;
} }
if (NEWTON_BOND || i3 < nlocal) { if (NEWTON_BOND || i3 < nlocal) {
f[i3].x += f3x; f[i3].x += f3x;
f[i3].y += f3y; f[i3].y += f3y;
f[i3].z += f3z; f[i3].z += f3z;
} }
if (NEWTON_BOND || i4 < nlocal) { if (NEWTON_BOND || i4 < nlocal) {
f[i4].x += f4x; f[i4].x += f4x;
f[i4].y += f4y; f[i4].y += f4y;
f[i4].z += f4z; f[i4].z += f4z;
} }
} }
} // for n } // for n
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
if (EFLAG) oedihedral += sedihedral; if (EFLAG) oedihedral += sedihedral;
if (VFLAG && vflag) { if (VFLAG && vflag) {
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov0 += sv0; ov1 += sv1; ov2 += sv2;
ov3 += sv3; ov4 += sv4; ov5 += sv5; ov3 += sv3; ov4 += sv4; ov5 += sv5;
} }
#endif #endif
} // omp parallel } // omp parallel
@ -422,7 +422,7 @@ void DihedralOPLSIntel::init_style()
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc, void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers) IntelBuffers<flt_t,acc_t> *buffers)
{ {
const int bp1 = atom->ndihedraltypes + 1; const int bp1 = atom->ndihedraltypes + 1;
fc.set_ntypes(bp1,memory); fc.set_ntypes(bp1,memory);
@ -439,11 +439,11 @@ void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void DihedralOPLSIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes, void DihedralOPLSIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
Memory *memory) { Memory *memory) {
if (nbondtypes != _nbondtypes) { if (nbondtypes != _nbondtypes) {
if (_nbondtypes > 0) if (_nbondtypes > 0)
_memory->destroy(bp); _memory->destroy(bp);
if (nbondtypes > 0) if (nbondtypes > 0)
_memory->create(bp,nbondtypes,"dihedralcharmmintel.bp"); _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
} }

View File

@ -44,8 +44,8 @@ class DihedralOPLSIntel : public DihedralOPLS {
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers); IntelBuffers<flt_t, acc_t> *buffers);

View File

@ -96,7 +96,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
_allow_separate_buffers = 1; _allow_separate_buffers = 1;
_offload_ghost = -1; _offload_ghost = -1;
_lrt = 0; _lrt = 0;
int iarg = 4; int iarg = 4;
while (iarg < narg) { while (iarg < narg) {
if (strcmp(arg[iarg],"omp") == 0) { if (strcmp(arg[iarg],"omp") == 0) {
@ -141,7 +141,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
else error->all(FLERR,"Illegal package intel command"); else error->all(FLERR,"Illegal package intel command");
iarg += 2; iarg += 2;
} }
// undocumented options // undocumented options
else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) { else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) {
@ -179,7 +179,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
_real_space_comm = MPI_COMM_WORLD; _real_space_comm = MPI_COMM_WORLD;
if (no_affinity == 0) if (no_affinity == 0)
if (set_host_affinity(nomp) != 0) if (set_host_affinity(nomp) != 0)
error->all(FLERR,"Could not set host affinity for offload tasks"); error->all(FLERR,"Could not set host affinity for offload tasks");
} }
int max_offload_threads = 0, offload_cores = 0; int max_offload_threads = 0, offload_cores = 0;
@ -264,7 +264,7 @@ FixIntel::~FixIntel()
double *time2 = off_watch_neighbor(); double *time2 = off_watch_neighbor();
int *overflow = get_off_overflow_flag(); int *overflow = get_off_overflow_flag();
if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL && if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL &&
overflow != NULL) { overflow != NULL) {
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(time1,time2,overflow:alloc_if(0) free_if(1)) nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
} }
@ -320,11 +320,11 @@ void FixIntel::init()
if (strstr(hybrid->keywords[i], "/intel") != NULL) if (strstr(hybrid->keywords[i], "/intel") != NULL)
nstyles++; nstyles++;
else else
force->pair->no_virial_fdotr_compute = 1; force->pair->no_virial_fdotr_compute = 1;
} }
if (nstyles > 1) if (nstyles > 1)
error->all(FLERR, error->all(FLERR,
"Currently, cannot use more than one intel style with hybrid."); "Currently, cannot use more than one intel style with hybrid.");
check_neighbor_intel(); check_neighbor_intel();
int off_mode = 0; int off_mode = 0;
@ -349,13 +349,13 @@ void FixIntel::setup(int vflag)
{ {
if (neighbor->style != BIN) if (neighbor->style != BIN)
error->all(FLERR, error->all(FLERR,
"Currently, neighbor style BIN must be used with Intel package."); "Currently, neighbor style BIN must be used with Intel package.");
if (neighbor->exclude_setting() != 0) if (neighbor->exclude_setting() != 0)
error->all(FLERR, error->all(FLERR,
"Currently, cannot use neigh_modify exclude with Intel package."); "Currently, cannot use neigh_modify exclude with Intel package.");
if (vflag_atom) if (vflag_atom)
error->all(FLERR, error->all(FLERR,
"Cannot currently get per-atom virials with Intel package."); "Cannot currently get per-atom virials with Intel package.");
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
post_force(vflag); post_force(vflag);
#endif #endif
@ -392,7 +392,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
double *time2 = off_watch_neighbor(); double *time2 = off_watch_neighbor();
int *overflow = get_off_overflow_flag(); int *overflow = get_off_overflow_flag();
if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL && if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL &&
overflow != NULL) { overflow != NULL) {
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \ nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
in(overflow:length(5) alloc_if(1) free_if(0)) in(overflow:length(5) alloc_if(1) free_if(0))
@ -407,7 +407,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
error->warning(FLERR, "Unknown Intel Compiler Version\n"); error->warning(FLERR, "Unknown Intel Compiler Version\n");
#else #else
if (__INTEL_COMPILER_BUILD_DATE != 20131008 && if (__INTEL_COMPILER_BUILD_DATE != 20131008 &&
__INTEL_COMPILER_BUILD_DATE < 20141023) __INTEL_COMPILER_BUILD_DATE < 20141023)
error->warning(FLERR, "Unsupported Intel Compiler."); error->warning(FLERR, "Unsupported Intel Compiler.");
#endif #endif
#if !defined(__INTEL_COMPILER) #if !defined(__INTEL_COMPILER)
@ -438,24 +438,24 @@ void FixIntel::pair_init_check(const bool cdmessage)
if (comm->me == 0) { if (comm->me == 0) {
if (screen) { if (screen) {
fprintf(screen, fprintf(screen,
"----------------------------------------------------------\n"); "----------------------------------------------------------\n");
if (_offload_balance != 0.0) { if (_offload_balance != 0.0) {
fprintf(screen,"Using Intel Coprocessor with %d threads per core, ", fprintf(screen,"Using Intel Coprocessor with %d threads per core, ",
_offload_tpc); _offload_tpc);
fprintf(screen,"%d threads per task\n",_offload_threads); fprintf(screen,"%d threads per task\n",_offload_threads);
} else { } else {
fprintf(screen,"Using Intel Package without Coprocessor.\n"); fprintf(screen,"Using Intel Package without Coprocessor.\n");
} }
fprintf(screen,"Precision: %s\n",kmode); fprintf(screen,"Precision: %s\n",kmode);
if (cdmessage) { if (cdmessage) {
#ifdef LMP_USE_AVXCD #ifdef LMP_USE_AVXCD
fprintf(screen,"AVX512 CD Optimizations: Enabled\n"); fprintf(screen,"AVX512 CD Optimizations: Enabled\n");
#else #else
fprintf(screen,"AVX512 CD Optimizations: Disabled\n"); fprintf(screen,"AVX512 CD Optimizations: Disabled\n");
#endif #endif
} }
fprintf(screen, fprintf(screen,
"----------------------------------------------------------\n"); "----------------------------------------------------------\n");
} }
} }
} }
@ -464,7 +464,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
void FixIntel::bond_init_check() void FixIntel::bond_init_check()
{ {
if (_offload_balance != 0.0 && atom->molecular && if (_offload_balance != 0.0 && atom->molecular &&
force->newton_pair != force->newton_bond) force->newton_pair != force->newton_bond)
error->all(FLERR, error->all(FLERR,
"USER-INTEL package requires same setting for newton bond and non-bond."); "USER-INTEL package requires same setting for newton bond and non-bond.");
@ -573,7 +573,7 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
int o_range, f_stride; int o_range, f_stride;
if (force->newton_pair) if (force->newton_pair)
o_range = atom->nlocal + atom->nghost; o_range = atom->nlocal + atom->nghost;
else else
o_range = atom->nlocal; o_range = atom->nlocal;
IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque); IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque);
@ -588,18 +588,18 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
_use_simd_pragma("vector aligned") _use_simd_pragma("vector aligned")
_use_simd_pragma("simd") _use_simd_pragma("simd")
for (int n = 0; n < o_range; n++) for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
} else if (_nthreads == 2) { } else if (_nthreads == 2) {
_use_simd_pragma("vector aligned") _use_simd_pragma("vector aligned")
_use_simd_pragma("simd") _use_simd_pragma("simd")
for (int n = 0; n < o_range; n++) for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n]; f_scalar[n] += f_scalar2[n];
} else { } else {
acc_t *f_scalar3 = f_scalar2 + f_stride4; acc_t *f_scalar3 = f_scalar2 + f_stride4;
_use_simd_pragma("vector aligned") _use_simd_pragma("vector aligned")
_use_simd_pragma("simd") _use_simd_pragma("simd")
for (int n = 0; n < o_range; n++) for (int n = 0; n < o_range; n++)
f_scalar[n] += f_scalar2[n] + f_scalar3[n]; f_scalar[n] += f_scalar2[n] + f_scalar3[n];
} }
} else { } else {
#if defined(_OPENMP) #if defined(_OPENMP)
@ -608,13 +608,13 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
{ {
int iifrom, iito, tid; int iifrom, iito, tid;
IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads, IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
sizeof(acc_t)); sizeof(acc_t));
acc_t *f_scalar2 = f_scalar + f_stride4; acc_t *f_scalar2 = f_scalar + f_stride4;
for (int t = 1; t < _nthreads; t++) { for (int t = 1; t < _nthreads; t++) {
_use_simd_pragma("vector aligned") _use_simd_pragma("vector aligned")
_use_simd_pragma("simd") _use_simd_pragma("simd")
for (int n = iifrom; n < iito; n++) for (int n = iifrom; n < iito; n++)
f_scalar[n] += f_scalar2[n]; f_scalar[n] += f_scalar2[n];
f_scalar2 += f_stride4; f_scalar2 += f_stride4;
} }
@ -648,33 +648,33 @@ template <class ft, class acc_t>
void FixIntel::add_results(const ft * _noalias const f_in, void FixIntel::add_results(const ft * _noalias const f_in,
const acc_t * _noalias const ev_global, const acc_t * _noalias const ev_global,
const int eatom, const int vatom, const int eatom, const int vatom,
const int offload) { const int offload) {
start_watch(TIME_PACK); start_watch(TIME_PACK);
int f_length; int f_length;
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (_separate_buffers) { if (_separate_buffers) {
if (offload) { if (offload) {
if (force->newton_pair) { if (force->newton_pair) {
add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal); add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
const acc_t * _noalias const enull = 0; const acc_t * _noalias const enull = 0;
int offset = _offload_nlocal; int offset = _offload_nlocal;
if (atom->torque) offset *= 2; if (atom->torque) offset *= 2;
add_oresults(f_in + offset, enull, eatom, vatom, add_oresults(f_in + offset, enull, eatom, vatom,
_offload_min_ghost, _offload_nghost); _offload_min_ghost, _offload_nghost);
} else } else
add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair()); add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair());
} else { } else {
if (force->newton_pair) { if (force->newton_pair) {
add_oresults(f_in, ev_global, eatom, vatom, add_oresults(f_in, ev_global, eatom, vatom,
_host_min_local, _host_used_local); _host_min_local, _host_used_local);
const acc_t * _noalias const enull = 0; const acc_t * _noalias const enull = 0;
int offset = _host_used_local; int offset = _host_used_local;
if (atom->torque) offset *= 2; if (atom->torque) offset *= 2;
add_oresults(f_in + offset, enull, eatom, add_oresults(f_in + offset, enull, eatom,
vatom, _host_min_ghost, _host_used_ghost); vatom, _host_min_ghost, _host_used_ghost);
} else { } else {
int start = host_start_pair(); int start = host_start_pair();
add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start); add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start);
} }
} }
stop_watch(TIME_PACK); stop_watch(TIME_PACK);
@ -685,9 +685,9 @@ void FixIntel::add_results(const ft * _noalias const f_in,
start = 0; start = 0;
if (force->newton_pair) { if (force->newton_pair) {
if (_offload_noghost == 0) if (_offload_noghost == 0)
f_length = atom->nlocal + atom->nghost; f_length = atom->nlocal + atom->nghost;
else else
f_length = atom->nlocal; f_length = atom->nlocal;
} else } else
f_length = offload_end_pair(); f_length = offload_end_pair();
} else { } else {
@ -714,9 +714,9 @@ void FixIntel::add_results(const ft * _noalias const f_in,
template <class ft, class acc_t> template <class ft, class acc_t>
void FixIntel::add_oresults(const ft * _noalias const f_in, void FixIntel::add_oresults(const ft * _noalias const f_in,
const acc_t * _noalias const ev_global, const acc_t * _noalias const ev_global,
const int eatom, const int vatom, const int eatom, const int vatom,
const int out_offset, const int nall) { const int out_offset, const int nall) {
lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset; lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
if (atom->torque) { if (atom->torque) {
if (f_in[1].w) if (f_in[1].w)
@ -744,12 +744,12 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
if (atom->torque) { if (atom->torque) {
int ii = ifrom * 2; int ii = ifrom * 2;
lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] + lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
out_offset; out_offset;
if (eatom) { if (eatom) {
double * _noalias const lmp_eatom = force->pair->eatom + out_offset; double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma novector #pragma novector
#endif #endif
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
f[i].x += f_in[ii].x; f[i].x += f_in[ii].x;
f[i].y += f_in[ii].y; f[i].y += f_in[ii].y;
@ -762,8 +762,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
} }
} else { } else {
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma novector #pragma novector
#endif #endif
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
f[i].x += f_in[ii].x; f[i].x += f_in[ii].x;
f[i].y += f_in[ii].y; f[i].y += f_in[ii].y;
@ -776,10 +776,10 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
} }
} else { } else {
if (eatom) { if (eatom) {
double * _noalias const lmp_eatom = force->pair->eatom + out_offset; double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma novector #pragma novector
#endif #endif
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
f[i].x += f_in[i].x; f[i].x += f_in[i].x;
f[i].y += f_in[i].y; f[i].y += f_in[i].y;
@ -788,8 +788,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
} }
} else { } else {
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma novector #pragma novector
#endif #endif
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
f[i].x += f_in[i].x; f[i].x += f_in[i].x;
f[i].y += f_in[i].y; f[i].y += f_in[i].y;
@ -931,7 +931,7 @@ void FixIntel::output_timing_data() {
balance_out[0] = _balance_pair; balance_out[0] = _balance_pair;
balance_out[1] = _balance_neighbor; balance_out[1] = _balance_neighbor;
MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM, MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM,
0, _real_space_comm); 0, _real_space_comm);
balance_in[0] /= size; balance_in[0] /= size;
balance_in[1] /= size; balance_in[1] /= size;
@ -958,25 +958,25 @@ void FixIntel::output_timing_data() {
balance_in[1]); balance_in[1]);
fprintf(_tscreen, " Offload Pair Balance %f\n", fprintf(_tscreen, " Offload Pair Balance %f\n",
balance_in[0]); balance_in[0]);
fprintf(_tscreen, " Offload Ghost Atoms "); fprintf(_tscreen, " Offload Ghost Atoms ");
if (_offload_noghost) fprintf(_tscreen,"No\n"); if (_offload_noghost) fprintf(_tscreen,"No\n");
else fprintf(_tscreen,"Yes\n"); else fprintf(_tscreen,"Yes\n");
#ifdef TIME_BALANCE #ifdef TIME_BALANCE
fprintf(_tscreen, " Offload Imbalance Seconds %f\n", fprintf(_tscreen, " Offload Imbalance Seconds %f\n",
timers[TIME_IMBALANCE]); timers[TIME_IMBALANCE]);
fprintf(_tscreen, " Offload Min/Max Seconds "); fprintf(_tscreen, " Offload Min/Max Seconds ");
for (int i = 0; i < NUM_ITIMERS; i++) for (int i = 0; i < NUM_ITIMERS; i++)
fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]); fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
fprintf(_tscreen, "\n"); fprintf(_tscreen, "\n");
#endif #endif
double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] + double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] +
timers[TIME_OFFLOAD_WAIT]; timers[TIME_OFFLOAD_WAIT];
double ct = timers[TIME_OFFLOAD_NEIGHBOR] + double ct = timers[TIME_OFFLOAD_NEIGHBOR] +
timers[TIME_OFFLOAD_PAIR]; timers[TIME_OFFLOAD_PAIR];
double tt = MAX(ht,ct); double tt = MAX(ht,ct);
if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0) if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0)
error->warning(FLERR, error->warning(FLERR,
"Leaving a core free can improve performance for offload"); "Leaving a core free can improve performance for offload");
} }
fprintf(_tscreen, "------------------------------------------------\n"); fprintf(_tscreen, "------------------------------------------------\n");
} }
@ -999,14 +999,14 @@ int FixIntel::get_ppn(int &node_rank) {
node_name[name_length] = '\0'; node_name[name_length] = '\0';
char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs]; char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs];
MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names, MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names,
MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm); MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm);
int ppn = 0; int ppn = 0;
node_rank = 0; node_rank = 0;
for (int i = 0; i < nprocs; i++) { for (int i = 0; i < nprocs; i++) {
if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) { if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) {
ppn++; ppn++;
if (i < rank) if (i < rank)
node_rank++; node_rank++;
} }
} }
@ -1068,19 +1068,19 @@ void FixIntel::set_offload_affinity()
kmp_create_affinity_mask(&mask); kmp_create_affinity_mask(&mask);
int proc = offload_threads * node_rank + tnum; int proc = offload_threads * node_rank + tnum;
#ifdef __AVX512F__ #ifdef __AVX512F__
proc = (proc / offload_tpc) + (proc % offload_tpc) * proc = (proc / offload_tpc) + (proc % offload_tpc) *
((offload_cores) / 4); ((offload_cores) / 4);
proc += 68; proc += 68;
#else #else
if (offload_affinity_balanced) if (offload_affinity_balanced)
proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1; proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1;
else else
proc += (proc / 4) * (4 - offload_tpc) + 1; proc += (proc / 4) * (4 - offload_tpc) + 1;
#endif #endif
kmp_set_affinity_mask_proc(proc, &mask); kmp_set_affinity_mask_proc(proc, &mask);
if (kmp_set_affinity(&mask) != 0) if (kmp_set_affinity(&mask) != 0)
printf("Could not set affinity on rank %d thread %d to %d\n", printf("Could not set affinity on rank %d thread %d to %d\n",
node_rank, tnum, proc); node_rank, tnum, proc);
} }
} }
@ -1110,7 +1110,7 @@ int FixIntel::set_host_affinity(const int nomp)
char cmd[512]; char cmd[512];
char readbuf[INTEL_MAX_HOST_CORE_COUNT*5]; char readbuf[INTEL_MAX_HOST_CORE_COUNT*5];
sprintf(cmd, "lscpu -p | grep -v '#' |" sprintf(cmd, "lscpu -p | grep -v '#' |"
"sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'"); "sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'");
p = popen(cmd, "r"); p = popen(cmd, "r");
if (p == NULL) return -1; if (p == NULL) return -1;
ncores = 0; ncores = 0;
@ -1147,7 +1147,7 @@ int FixIntel::set_host_affinity(const int nomp)
if (subscription > ncores) { if (subscription > ncores) {
if (rank == 0) if (rank == 0)
error->warning(FLERR, error->warning(FLERR,
"More MPI tasks/OpenMP threads than available cores"); "More MPI tasks/OpenMP threads than available cores");
return 0; return 0;
} }
if (subscription == ncores) if (subscription == ncores)
@ -1173,10 +1173,10 @@ int FixIntel::set_host_affinity(const int nomp)
int first = coi_cores + node_rank * mpi_cores; int first = coi_cores + node_rank * mpi_cores;
CPU_ZERO(&cpuset); CPU_ZERO(&cpuset);
for (int i = first; i < first + mpi_cores; i++) for (int i = first; i < first + mpi_cores; i++)
CPU_SET(proc_list[i], &cpuset); CPU_SET(proc_list[i], &cpuset);
if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) { if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
fail = 1; fail = 1;
break; break;
} }
plwp++; plwp++;
} }
@ -1189,13 +1189,13 @@ int FixIntel::set_host_affinity(const int nomp)
buf1 = (float*) malloc(sizeof(float)*pragma_size); buf1 = (float*) malloc(sizeof(float)*pragma_size);
#pragma offload target (mic:0) mandatory \ #pragma offload target (mic:0) mandatory \
in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \ in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \
signal(&sig1) signal(&sig1)
{ buf1[0] = 0.0; } { buf1[0] = 0.0; }
#pragma offload_wait target(mic:0) wait(&sig1) #pragma offload_wait target(mic:0) wait(&sig1)
#pragma offload target (mic:0) mandatory \ #pragma offload target (mic:0) mandatory \
out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \ out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \
signal(&sig2) signal(&sig2)
{ buf1[0] = 1.0; } { buf1[0] = 1.0; }
#pragma offload_wait target(mic:0) wait(&sig2) #pragma offload_wait target(mic:0) wait(&sig2)
@ -1211,11 +1211,11 @@ int FixIntel::set_host_affinity(const int nomp)
CPU_ZERO(&cpuset); CPU_ZERO(&cpuset);
for(int i=0; i<coi_cores; i++) for(int i=0; i<coi_cores; i++)
CPU_SET(proc_list[i], &cpuset); CPU_SET(proc_list[i], &cpuset);
if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) { if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
fail = 1; fail = 1;
break; break;
} }
} }
pclose(p); pclose(p);
@ -1228,7 +1228,7 @@ int FixIntel::set_host_affinity(const int nomp)
if (screen && rank == 0) { if (screen && rank == 0) {
if (coi_cores) if (coi_cores)
fprintf(screen,"Intel Package: Affinitizing %d Offload Threads to %d Cores\n", fprintf(screen,"Intel Package: Affinitizing %d Offload Threads to %d Cores\n",
mlwp, coi_cores); mlwp, coi_cores);
fprintf(screen,"Intel Package: Affinitizing MPI Tasks to %d Cores Each\n",mpi_cores); fprintf(screen,"Intel Package: Affinitizing MPI Tasks to %d Cores Each\n",mpi_cores);
} }
if (fail) return -1; if (fail) return -1;

View File

@ -72,7 +72,7 @@ class FixIntel : public Fix {
inline void nbor_pack_width(const int w) { _nbor_pack_width = w; } inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
inline int three_body_neighbor() { return _three_body_neighbor; } inline int three_body_neighbor() { return _three_body_neighbor; }
inline void three_body_neighbor(const int i) { _three_body_neighbor = 1; } inline void three_body_neighbor(const int i) { _three_body_neighbor = 1; }
inline int need_zero(const int tid) { inline int need_zero(const int tid) {
if (_need_reduce == 0 && tid > 0) return 1; if (_need_reduce == 0 && tid > 0) return 1;
return 0; return 0;
@ -84,11 +84,11 @@ class FixIntel : public Fix {
} }
inline int pppm_table() { inline int pppm_table() {
if (force->kspace_match("pppm/intel", 0) || if (force->kspace_match("pppm/intel", 0) ||
force->kspace_match("pppm/disp/intel",0)) force->kspace_match("pppm/disp/intel",0))
return INTEL_P3M_TABLE; return INTEL_P3M_TABLE;
else return 0; else return 0;
} }
protected: protected:
IntelBuffers<float,float> *_single_buffers; IntelBuffers<float,float> *_single_buffers;
@ -103,17 +103,17 @@ class FixIntel : public Fix {
inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in, inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
double *ev_in, const int offload, double *ev_in, const int offload,
const int eatom = 0, const int vatom = 0, const int eatom = 0, const int vatom = 0,
const int rflag = 0); const int rflag = 0);
inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in, inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
double *ev_in, const int offload, double *ev_in, const int offload,
const int eatom = 0, const int vatom = 0, const int eatom = 0, const int vatom = 0,
const int rflag = 0); const int rflag = 0);
inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in, inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
float *ev_in, const int offload, float *ev_in, const int offload,
const int eatom = 0, const int vatom = 0, const int eatom = 0, const int vatom = 0,
const int rflag = 0); const int rflag = 0);
inline void get_buffern(const int offload, int &nlocal, int &nall, inline void get_buffern(const int offload, int &nlocal, int &nall,
int &minlocal); int &minlocal);
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
void post_force(int vflag); void post_force(int vflag);
@ -213,13 +213,13 @@ class FixIntel : public Fix {
inline void add_results(const ft * _noalias const f_in, inline void add_results(const ft * _noalias const f_in,
const acc_t * _noalias const ev_global, const acc_t * _noalias const ev_global,
const int eatom, const int vatom, const int eatom, const int vatom,
const int offload); const int offload);
template <class ft, class acc_t> template <class ft, class acc_t>
inline void add_oresults(const ft * _noalias const f_in, inline void add_oresults(const ft * _noalias const f_in,
const acc_t * _noalias const ev_global, const acc_t * _noalias const ev_global,
const int eatom, const int vatom, const int eatom, const int vatom,
const int out_offset, const int nall); const int out_offset, const int nall);
int _offload_affinity_balanced, _offload_threads, _offload_tpc; int _offload_affinity_balanced, _offload_threads, _offload_tpc;
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
@ -235,16 +235,16 @@ class FixIntel : public Fix {
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
void FixIntel::get_buffern(const int offload, int &nlocal, int &nall, void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
int &minlocal) { int &minlocal) {
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (_separate_buffers) { if (_separate_buffers) {
if (offload) { if (offload) {
if (neighbor->ago != 0) { if (neighbor->ago != 0) {
nlocal = _offload_nlocal; nlocal = _offload_nlocal;
nall = _offload_nall; nall = _offload_nall;
} else { } else {
nlocal = atom->nlocal; nlocal = atom->nlocal;
nall = nlocal + atom->nghost; nall = nlocal + atom->nghost;
} }
minlocal = 0; minlocal = 0;
} else { } else {
@ -253,7 +253,7 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
if (force->newton) if (force->newton)
minlocal = _host_min_local; minlocal = _host_min_local;
else else
minlocal = host_start_pair(); minlocal = host_start_pair();
} }
return; return;
} }
@ -271,7 +271,7 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in, void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
double *ev_in, const int offload, double *ev_in, const int offload,
const int eatom, const int vatom, const int eatom, const int vatom,
const int rflag) { const int rflag) {
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (offload) { if (offload) {
_off_results_eatom = eatom; _off_results_eatom = eatom;
@ -299,7 +299,7 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in, void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
double *ev_in, const int offload, double *ev_in, const int offload,
const int eatom, const int vatom, const int eatom, const int vatom,
const int rflag) { const int rflag) {
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (offload) { if (offload) {
_off_results_eatom = eatom; _off_results_eatom = eatom;
@ -361,12 +361,12 @@ int FixIntel::offload_end_neighbor() {
if (atom->nlocal < 2) if (atom->nlocal < 2)
error->one(FLERR,"Too few atoms for load balancing offload"); error->one(FLERR,"Too few atoms for load balancing offload");
double granularity = 1.0 / atom->nlocal; double granularity = 1.0 / atom->nlocal;
if (_balance_neighbor < granularity) if (_balance_neighbor < granularity)
_balance_neighbor = granularity + 1e-10; _balance_neighbor = granularity + 1e-10;
else if (_balance_neighbor > 1.0 - granularity) else if (_balance_neighbor > 1.0 - granularity)
_balance_neighbor = 1.0 - granularity + 1e-10; _balance_neighbor = 1.0 - granularity + 1e-10;
} }
return _balance_neighbor * atom->nlocal; return _balance_neighbor * atom->nlocal;
} }
int FixIntel::offload_end_pair() { int FixIntel::offload_end_pair() {
@ -517,7 +517,7 @@ The newton setting must be the same for both pairwise and bonded forces.
E: Intel styles for bond/angle/dihedral/improper require intel pair style." E: Intel styles for bond/angle/dihedral/improper require intel pair style."
You cannot use the USER-INTEL package for bond calculations without a You cannot use the USER-INTEL package for bond calculations without a
USER-INTEL supported pair style. USER-INTEL supported pair style.
E: Intel styles for kspace require intel pair style. E: Intel styles for kspace require intel pair style.

View File

@ -45,7 +45,7 @@ typedef struct { double x,y,z; } dbl3_t;
NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion
---------------------------------------------------------------------- */ ---------------------------------------------------------------------- */
FixNHIntel::FixNHIntel(LAMMPS *lmp, int narg, char **arg) : FixNHIntel::FixNHIntel(LAMMPS *lmp, int narg, char **arg) :
FixNH(lmp, narg, arg) FixNH(lmp, narg, arg)
{ {
_dtfm = 0; _dtfm = 0;
@ -118,12 +118,12 @@ void FixNHIntel::remap()
#endif #endif
for (int i = 0; i < nlocal; i++) { for (int i = 0; i < nlocal; i++) {
if (mask[i] & dilate_group_bit) { if (mask[i] & dilate_group_bit) {
const double d0 = x[i].x - b0; const double d0 = x[i].x - b0;
const double d1 = x[i].y - b1; const double d1 = x[i].y - b1;
const double d2 = x[i].z - b2; const double d2 = x[i].z - b2;
x[i].x = hi0*d0 + hi5*d1 + hi4*d2; x[i].x = hi0*d0 + hi5*d1 + hi4*d2;
x[i].y = hi1*d1 + hi3*d2; x[i].y = hi1*d1 + hi3*d2;
x[i].z = hi2*d2; x[i].z = hi2*d2;
} }
} }
} }
@ -294,9 +294,9 @@ void FixNHIntel::remap()
#endif #endif
for (int i = 0; i < nlocal; i++) { for (int i = 0; i < nlocal; i++) {
if (mask[i] & dilate_group_bit) { if (mask[i] & dilate_group_bit) {
x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
x[i].y = h1*x[i].y + h3*x[i].z + nb1; x[i].y = h1*x[i].y + h3*x[i].z + nb1;
x[i].z = h2*x[i].z + nb2; x[i].z = h2*x[i].z + nb2;
} }
} }
} }
@ -318,7 +318,7 @@ void FixNHIntel::reset_dt()
dto = dthalf; dto = dthalf;
// If using respa, then remap is performed in innermost level // If using respa, then remap is performed in innermost level
if (strstr(update->integrate_style,"respa")) if (strstr(update->integrate_style,"respa"))
dto = 0.5*step_respa[0]; dto = 0.5*step_respa[0];
@ -329,7 +329,7 @@ void FixNHIntel::reset_dt()
tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain); tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain);
const int * const mask = atom->mask; const int * const mask = atom->mask;
const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
atom->nlocal; atom->nlocal;
if (nlocal > _nlocal_max) { if (nlocal > _nlocal_max) {
@ -345,9 +345,9 @@ void FixNHIntel::reset_dt()
const double * const rmass = atom->rmass; const double * const rmass = atom->rmass;
int n = 0; int n = 0;
for (int i = 0; i < nlocal; i++) { for (int i = 0; i < nlocal; i++) {
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
} }
} else { } else {
const double * const mass = atom->mass; const double * const mass = atom->mass;
@ -364,29 +364,29 @@ void FixNHIntel::reset_dt()
const double * const rmass = atom->rmass; const double * const rmass = atom->rmass;
int n = 0; int n = 0;
for (int i = 0; i < nlocal; i++) for (int i = 0; i < nlocal; i++)
if (mask[i] & groupbit) { if (mask[i] & groupbit) {
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
} else { } else {
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
} }
} else { } else {
const double * const mass = atom->mass; const double * const mass = atom->mass;
const int * const type = atom->type; const int * const type = atom->type;
int n = 0; int n = 0;
for (int i = 0; i < nlocal; i++) for (int i = 0; i < nlocal; i++)
if (mask[i] & groupbit) { if (mask[i] & groupbit) {
_dtfm[n++] = dtf / mass[type[i]]; _dtfm[n++] = dtf / mass[type[i]];
_dtfm[n++] = dtf / mass[type[i]]; _dtfm[n++] = dtf / mass[type[i]];
_dtfm[n++] = dtf / mass[type[i]]; _dtfm[n++] = dtf / mass[type[i]];
} else { } else {
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
} }
} }
} }
} }
@ -431,9 +431,9 @@ void FixNHIntel::nh_v_press()
#endif #endif
for (int i = 0; i < nlocal; i++) { for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) { if (mask[i] & groupbit) {
v[i].x *= f0; v[i].x *= f0;
v[i].y *= f1; v[i].y *= f1;
v[i].z *= f2; v[i].z *= f2;
} }
} }
} }
@ -506,7 +506,7 @@ void FixNHIntel::nh_v_temp()
#pragma simd #pragma simd
#endif #endif
for (int i = 0; i < _nlocal3; i++) for (int i = 0; i < _nlocal3; i++)
v[i] *= factor_eta; v[i] *= factor_eta;
} else { } else {
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
@ -514,12 +514,12 @@ void FixNHIntel::nh_v_temp()
#endif #endif
for (int i = 0; i < _nlocal3; i++) { for (int i = 0; i < _nlocal3; i++) {
if (_dtfm[i] != 0.0) if (_dtfm[i] != 0.0)
v[i] *= factor_eta; v[i] *= factor_eta;
} }
} }
} }
double FixNHIntel::memory_usage() double FixNHIntel::memory_usage()
{ {
return FixNH::memory_usage() + _nlocal_max * 3 * sizeof(double); return FixNH::memory_usage() + _nlocal_max * 3 * sizeof(double);
} }

View File

@ -35,7 +35,7 @@ class FixNHIntel : public FixNH {
int _nlocal3, _nlocal_max; int _nlocal3, _nlocal_max;
virtual void remap(); virtual void remap();
virtual void nve_x(); virtual void nve_x();
virtual void nve_v(); virtual void nve_v();
virtual void nh_v_press(); virtual void nh_v_press();
virtual void nh_v_temp(); virtual void nh_v_temp();

View File

@ -36,7 +36,7 @@ using namespace FixConst;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
FixNVEAsphereIntel::FixNVEAsphereIntel(LAMMPS *lmp, int narg, char **arg) : FixNVEAsphereIntel::FixNVEAsphereIntel(LAMMPS *lmp, int narg, char **arg) :
FixNVE(lmp, narg, arg) FixNVE(lmp, narg, arg)
{ {
_dtfm = 0; _dtfm = 0;
_nlocal3 = 0; _nlocal3 = 0;
@ -129,9 +129,9 @@ void FixNVEAsphereIntel::initial_integrate(int vflag)
#endif #endif
for (int i = 0; i < nlocal; i++) { for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) { if (mask[i] & groupbit) {
double *quat = bonus[ellipsoid[i]].quat; double *quat = bonus[ellipsoid[i]].quat;
ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i], ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i],
_inertia1[i], _inertia2[i]); _inertia1[i], _inertia2[i]);
} }
} }
} }
@ -168,7 +168,7 @@ void FixNVEAsphereIntel::reset_dt() {
dtf = 0.5 * update->dt * force->ftm2v; dtf = 0.5 * update->dt * force->ftm2v;
const int * const mask = atom->mask; const int * const mask = atom->mask;
const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
atom->nlocal; atom->nlocal;
if (nlocal > _nlocal_max) { if (nlocal > _nlocal_max) {
@ -211,27 +211,27 @@ void FixNVEAsphereIntel::reset_dt() {
for (int i = 0; i < nlocal; i++) { for (int i = 0; i < nlocal; i++) {
if (mask[i] & groupbit) { if (mask[i] & groupbit) {
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
double *shape = bonus[ellipsoid[i]].shape; double *shape = bonus[ellipsoid[i]].shape;
double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]);
if (idot != 0.0) idot = 1.0 / idot; if (idot != 0.0) idot = 1.0 / idot;
_inertia0[i] = idot; _inertia0[i] = idot;
idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]);
if (idot != 0.0) idot = 1.0 / idot; if (idot != 0.0) idot = 1.0 / idot;
_inertia1[i] = idot; _inertia1[i] = idot;
idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]);
if (idot != 0.0) idot = 1.0 / idot; if (idot != 0.0) idot = 1.0 / idot;
_inertia2[i] = idot; _inertia2[i] = idot;
} else { } else {
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
} }
} }
} }
} }
double FixNVEAsphereIntel::memory_usage() double FixNVEAsphereIntel::memory_usage()
{ {
return FixNVE::memory_usage() + _nlocal_max * 12 * sizeof(double); return FixNVE::memory_usage() + _nlocal_max * 12 * sizeof(double);
} }

View File

@ -29,7 +29,7 @@ using namespace FixConst;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
FixNVEIntel::FixNVEIntel(LAMMPS *lmp, int narg, char **arg) : FixNVEIntel::FixNVEIntel(LAMMPS *lmp, int narg, char **arg) :
FixNVE(lmp, narg, arg) FixNVE(lmp, narg, arg)
{ {
_dtfm = 0; _dtfm = 0;
_nlocal3 = 0; _nlocal3 = 0;
@ -91,7 +91,7 @@ void FixNVEIntel::initial_integrate(int vflag)
for (int i = 0; i < _nlocal3; i++) { for (int i = 0; i < _nlocal3; i++) {
if (_dtfm[i] != 0.0) { if (_dtfm[i] != 0.0) {
v[i] += _dtfm[i] * f[i]; v[i] += _dtfm[i] * f[i];
x[i] += dtv * v[i]; x[i] += dtv * v[i];
} }
} }
} }
@ -130,7 +130,7 @@ void FixNVEIntel::reset_dt() {
dtf = 0.5 * update->dt * force->ftm2v; dtf = 0.5 * update->dt * force->ftm2v;
const int * const mask = atom->mask; const int * const mask = atom->mask;
const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
atom->nlocal; atom->nlocal;
if (nlocal > _nlocal_max) { if (nlocal > _nlocal_max) {
@ -146,9 +146,9 @@ void FixNVEIntel::reset_dt() {
const double * const rmass = atom->rmass; const double * const rmass = atom->rmass;
int n = 0; int n = 0;
for (int i = 0; i < nlocal; i++) { for (int i = 0; i < nlocal; i++) {
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
} }
} else { } else {
const double * const mass = atom->mass; const double * const mass = atom->mass;
@ -165,34 +165,34 @@ void FixNVEIntel::reset_dt() {
const double * const rmass = atom->rmass; const double * const rmass = atom->rmass;
int n = 0; int n = 0;
for (int i = 0; i < nlocal; i++) for (int i = 0; i < nlocal; i++)
if (mask[i] & groupbit) { if (mask[i] & groupbit) {
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
_dtfm[n++] = dtf / rmass[i]; _dtfm[n++] = dtf / rmass[i];
} else { } else {
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
} }
} else { } else {
const double * const mass = atom->mass; const double * const mass = atom->mass;
const int * const type = atom->type; const int * const type = atom->type;
int n = 0; int n = 0;
for (int i = 0; i < nlocal; i++) for (int i = 0; i < nlocal; i++)
if (mask[i] & groupbit) { if (mask[i] & groupbit) {
_dtfm[n++] = dtf / mass[type[i]]; _dtfm[n++] = dtf / mass[type[i]];
_dtfm[n++] = dtf / mass[type[i]]; _dtfm[n++] = dtf / mass[type[i]];
_dtfm[n++] = dtf / mass[type[i]]; _dtfm[n++] = dtf / mass[type[i]];
} else { } else {
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
_dtfm[n++] = 0.0; _dtfm[n++] = 0.0;
} }
} }
} }
} }
double FixNVEIntel::memory_usage() double FixNVEIntel::memory_usage()
{ {
return FixNVE::memory_usage() + _nlocal_max * 3 * sizeof(double); return FixNVE::memory_usage() + _nlocal_max * 3 * sizeof(double);
} }

View File

@ -42,7 +42,7 @@ typedef struct { int a,b,c,d,t; } int5_t;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
ImproperCvffIntel::ImproperCvffIntel(LAMMPS *lmp) : ImproperCvffIntel::ImproperCvffIntel(LAMMPS *lmp) :
ImproperCvff(lmp) ImproperCvff(lmp)
{ {
suffix_flag |= Suffix::INTEL; suffix_flag |= Suffix::INTEL;
@ -80,8 +80,8 @@ void ImproperCvffIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void ImproperCvffIntel::compute(int eflag, int vflag, void ImproperCvffIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) ev_setup(eflag,vflag); if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = 0; else evflag = 0;
@ -89,14 +89,14 @@ void ImproperCvffIntel::compute(int eflag, int vflag,
if (evflag) { if (evflag) {
if (vflag && !eflag) { if (vflag && !eflag) {
if (force->newton_bond) if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc); eval<0,1,1>(vflag, buffers, fc);
else else
eval<0,1,0>(vflag, buffers, fc); eval<0,1,0>(vflag, buffers, fc);
} else { } else {
if (force->newton_bond) if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc); eval<1,1,1>(vflag, buffers, fc);
else else
eval<1,1,0>(vflag, buffers, fc); eval<1,1,0>(vflag, buffers, fc);
} }
} else { } else {
if (force->newton_bond) if (force->newton_bond)
@ -109,9 +109,9 @@ void ImproperCvffIntel::compute(int eflag, int vflag,
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void ImproperCvffIntel::eval(const int vflag, void ImproperCvffIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
const int inum = neighbor->nimproperlist; const int inum = neighbor->nimproperlist;
if (inum == 0) return; if (inum == 0) return;
@ -153,7 +153,7 @@ void ImproperCvffIntel::eval(const int vflag,
if (fix->need_zero(tid)) if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T)); memset(f, 0, f_stride * sizeof(FORCE_T));
const int5_t * _noalias const improperlist = const int5_t * _noalias const improperlist =
(int5_t *) neighbor->improperlist[0]; (int5_t *) neighbor->improperlist[0];
#ifdef LMP_INTEL_USE_SIMDOFF_FIX #ifdef LMP_INTEL_USE_SIMDOFF_FIX
@ -230,22 +230,22 @@ void ImproperCvffIntel::eval(const int vflag,
#ifndef LMP_INTEL_USE_SIMDOFF_FIX #ifndef LMP_INTEL_USE_SIMDOFF_FIX
if (c > PTOLERANCE || c < MTOLERANCE) { if (c > PTOLERANCE || c < MTOLERANCE) {
int me; int me;
MPI_Comm_rank(world,&me); MPI_Comm_rank(world,&me);
if (screen) { if (screen) {
char str[128]; char str[128];
sprintf(str,"Improper problem: %d " BIGINT_FORMAT " " sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT, TAGINT_FORMAT " " TAGINT_FORMAT,
me,update->ntimestep, me,update->ntimestep,
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
error->warning(FLERR,str,0); error->warning(FLERR,str,0);
fprintf(screen," 1st atom: %d %g %g %g\n", fprintf(screen," 1st atom: %d %g %g %g\n",
me,x[i1].x,x[i1].y,x[i1].z); me,x[i1].x,x[i1].y,x[i1].z);
fprintf(screen," 2nd atom: %d %g %g %g\n", fprintf(screen," 2nd atom: %d %g %g %g\n",
me,x[i2].x,x[i2].y,x[i2].z); me,x[i2].x,x[i2].y,x[i2].z);
fprintf(screen," 3rd atom: %d %g %g %g\n", fprintf(screen," 3rd atom: %d %g %g %g\n",
me,x[i3].x,x[i3].y,x[i3].z); me,x[i3].x,x[i3].y,x[i3].z);
fprintf(screen," 4th atom: %d %g %g %g\n", fprintf(screen," 4th atom: %d %g %g %g\n",
me,x[i4].x,x[i4].y,x[i4].z); me,x[i4].x,x[i4].y,x[i4].z);
} }
} }
@ -268,35 +268,35 @@ void ImproperCvffIntel::eval(const int vflag,
{ {
if (m == 2) { if (m == 2) {
p = (flt_t)2.0*c*c; p = (flt_t)2.0*c*c;
pd = (flt_t)2.0*c; pd = (flt_t)2.0*c;
} else if (m == 3) { } else if (m == 3) {
const flt_t rc2 = c*c; const flt_t rc2 = c*c;
p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0; p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0;
pd = (flt_t)6.0*rc2 - (flt_t)1.5; pd = (flt_t)6.0*rc2 - (flt_t)1.5;
} else if (m == 4) { } else if (m == 4) {
const flt_t rc2 = c*c; const flt_t rc2 = c*c;
p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0; p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0;
pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c; pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c;
} else if (m == 6) { } else if (m == 6) {
const flt_t rc2 = c*c; const flt_t rc2 = c*c;
p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2; p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2;
pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c; pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c;
} else if (m == 1) { } else if (m == 1) {
p = c + (flt_t)1.0; p = c + (flt_t)1.0;
pd = (flt_t)0.5; pd = (flt_t)0.5;
} else if (m == 5) { } else if (m == 5) {
const flt_t rc2 = c*c; const flt_t rc2 = c*c;
p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0; p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0;
pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5; pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5;
} else if (m == 0) { } else if (m == 0) {
p = (flt_t)2.0; p = (flt_t)2.0;
pd = (flt_t)0.0; pd = (flt_t)0.0;
} }
} }
if (fc.fc[type].sign == -1) { if (fc.fc[type].sign == -1) {
p = (flt_t)2.0 - p; p = (flt_t)2.0 - p;
pd = -pd; pd = -pd;
} }
flt_t eimproper; flt_t eimproper;
@ -340,43 +340,43 @@ void ImproperCvffIntel::eval(const int vflag,
{ {
if (NEWTON_BOND || i1 < nlocal) { if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += f1x; f[i1].x += f1x;
f[i1].y += f1y; f[i1].y += f1y;
f[i1].z += f1z; f[i1].z += f1z;
} }
if (NEWTON_BOND || i2 < nlocal) { if (NEWTON_BOND || i2 < nlocal) {
f[i2].x += f2x; f[i2].x += f2x;
f[i2].y += f2y; f[i2].y += f2y;
f[i2].z += f2z; f[i2].z += f2z;
} }
if (NEWTON_BOND || i3 < nlocal) { if (NEWTON_BOND || i3 < nlocal) {
f[i3].x += f3x; f[i3].x += f3x;
f[i3].y += f3y; f[i3].y += f3y;
f[i3].z += f3z; f[i3].z += f3z;
} }
if (NEWTON_BOND || i4 < nlocal) { if (NEWTON_BOND || i4 < nlocal) {
f[i4].x += f4x; f[i4].x += f4x;
f[i4].y += f4y; f[i4].y += f4y;
f[i4].z += f4z; f[i4].z += f4z;
} }
} }
if (EFLAG || VFLAG) { if (EFLAG || VFLAG) {
#ifdef LMP_INTEL_USE_SIMDOFF_FIX #ifdef LMP_INTEL_USE_SIMDOFF_FIX
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
nlocal, sv0, sv1, sv2, sv3, sv4, sv5); nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
#else #else
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
nlocal, ov0, ov1, ov2, ov3, ov4, ov5); nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
#endif #endif
} }
} // for n } // for n
#ifdef LMP_INTEL_USE_SIMDOFF_FIX #ifdef LMP_INTEL_USE_SIMDOFF_FIX
@ -390,7 +390,7 @@ void ImproperCvffIntel::eval(const int vflag,
if (EFLAG) energy += oeimproper; if (EFLAG) energy += oeimproper;
if (VFLAG && vflag) { if (VFLAG && vflag) {
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
} }
fix->set_reduce_flag(); fix->set_reduce_flag();
@ -428,7 +428,7 @@ void ImproperCvffIntel::init_style()
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc, void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers) IntelBuffers<flt_t,acc_t> *buffers)
{ {
const int bp1 = atom->nimpropertypes + 1; const int bp1 = atom->nimpropertypes + 1;
fc.set_ntypes(bp1,memory); fc.set_ntypes(bp1,memory);
@ -444,11 +444,11 @@ void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void ImproperCvffIntel::ForceConst<flt_t>::set_ntypes(const int nimproper, void ImproperCvffIntel::ForceConst<flt_t>::set_ntypes(const int nimproper,
Memory *memory) { Memory *memory) {
if (nimproper != _nimpropertypes) { if (nimproper != _nimpropertypes) {
if (_nimpropertypes > 0) if (_nimpropertypes > 0)
_memory->destroy(fc); _memory->destroy(fc);
if (nimproper > 0) if (nimproper > 0)
_memory->create(fc,nimproper,"improperharmonicintel.fc"); _memory->create(fc,nimproper,"improperharmonicintel.fc");
} }

View File

@ -45,8 +45,8 @@ class ImproperCvffIntel : public ImproperCvff {
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers); IntelBuffers<flt_t, acc_t> *buffers);

View File

@ -43,7 +43,7 @@ typedef struct { int a,b,c,d,t; } int5_t;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) : ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) :
ImproperHarmonic(lmp) ImproperHarmonic(lmp)
{ {
suffix_flag |= Suffix::INTEL; suffix_flag |= Suffix::INTEL;
@ -81,8 +81,8 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void ImproperHarmonicIntel::compute(int eflag, int vflag, void ImproperHarmonicIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) ev_setup(eflag,vflag); if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = 0; else evflag = 0;
@ -90,14 +90,14 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag,
if (evflag) { if (evflag) {
if (vflag && !eflag) { if (vflag && !eflag) {
if (force->newton_bond) if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc); eval<0,1,1>(vflag, buffers, fc);
else else
eval<0,1,0>(vflag, buffers, fc); eval<0,1,0>(vflag, buffers, fc);
} else { } else {
if (force->newton_bond) if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc); eval<1,1,1>(vflag, buffers, fc);
else else
eval<1,1,0>(vflag, buffers, fc); eval<1,1,0>(vflag, buffers, fc);
} }
} else { } else {
if (force->newton_bond) if (force->newton_bond)
@ -110,9 +110,9 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag,
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void ImproperHarmonicIntel::eval(const int vflag, void ImproperHarmonicIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
const int inum = neighbor->nimproperlist; const int inum = neighbor->nimproperlist;
if (inum == 0) return; if (inum == 0) return;
@ -154,7 +154,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
if (fix->need_zero(tid)) if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T)); memset(f, 0, f_stride * sizeof(FORCE_T));
const int5_t * _noalias const improperlist = const int5_t * _noalias const improperlist =
(int5_t *) neighbor->improperlist[0]; (int5_t *) neighbor->improperlist[0];
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
@ -221,22 +221,22 @@ void ImproperHarmonicIntel::eval(const int vflag,
#ifndef LMP_INTEL_USE_SIMDOFF #ifndef LMP_INTEL_USE_SIMDOFF
if (c > PTOLERANCE || c < MTOLERANCE) { if (c > PTOLERANCE || c < MTOLERANCE) {
int me; int me;
MPI_Comm_rank(world,&me); MPI_Comm_rank(world,&me);
if (screen) { if (screen) {
char str[128]; char str[128];
sprintf(str,"Improper problem: %d " BIGINT_FORMAT " " sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT, TAGINT_FORMAT " " TAGINT_FORMAT,
me,update->ntimestep, me,update->ntimestep,
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
error->warning(FLERR,str,0); error->warning(FLERR,str,0);
fprintf(screen," 1st atom: %d %g %g %g\n", fprintf(screen," 1st atom: %d %g %g %g\n",
me,x[i1].x,x[i1].y,x[i1].z); me,x[i1].x,x[i1].y,x[i1].z);
fprintf(screen," 2nd atom: %d %g %g %g\n", fprintf(screen," 2nd atom: %d %g %g %g\n",
me,x[i2].x,x[i2].y,x[i2].z); me,x[i2].x,x[i2].y,x[i2].z);
fprintf(screen," 3rd atom: %d %g %g %g\n", fprintf(screen," 3rd atom: %d %g %g %g\n",
me,x[i3].x,x[i3].y,x[i3].z); me,x[i3].x,x[i3].y,x[i3].z);
fprintf(screen," 4th atom: %d %g %g %g\n", fprintf(screen," 4th atom: %d %g %g %g\n",
me,x[i4].x,x[i4].y,x[i4].z); me,x[i4].x,x[i4].y,x[i4].z);
} }
} }
@ -296,43 +296,43 @@ void ImproperHarmonicIntel::eval(const int vflag,
{ {
if (NEWTON_BOND || i1 < nlocal) { if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += f1x; f[i1].x += f1x;
f[i1].y += f1y; f[i1].y += f1y;
f[i1].z += f1z; f[i1].z += f1z;
} }
if (NEWTON_BOND || i2 < nlocal) { if (NEWTON_BOND || i2 < nlocal) {
f[i2].x += f2x; f[i2].x += f2x;
f[i2].y += f2y; f[i2].y += f2y;
f[i2].z += f2z; f[i2].z += f2z;
} }
if (NEWTON_BOND || i3 < nlocal) { if (NEWTON_BOND || i3 < nlocal) {
f[i3].x += f3x; f[i3].x += f3x;
f[i3].y += f3y; f[i3].y += f3y;
f[i3].z += f3z; f[i3].z += f3z;
} }
if (NEWTON_BOND || i4 < nlocal) { if (NEWTON_BOND || i4 < nlocal) {
f[i4].x += f4x; f[i4].x += f4x;
f[i4].y += f4y; f[i4].y += f4y;
f[i4].z += f4z; f[i4].z += f4z;
} }
} }
if (EFLAG || VFLAG) { if (EFLAG || VFLAG) {
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
nlocal, sv0, sv1, sv2, sv3, sv4, sv5); nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
#else #else
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
nlocal, ov0, ov1, ov2, ov3, ov4, ov5); nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
#endif #endif
} }
} // for n } // for n
#ifdef LMP_INTEL_USE_SIMDOFF #ifdef LMP_INTEL_USE_SIMDOFF
@ -346,7 +346,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
if (EFLAG) energy += oeimproper; if (EFLAG) energy += oeimproper;
if (VFLAG && vflag) { if (VFLAG && vflag) {
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
} }
fix->set_reduce_flag(); fix->set_reduce_flag();
@ -384,7 +384,7 @@ void ImproperHarmonicIntel::init_style()
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc, void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers) IntelBuffers<flt_t,acc_t> *buffers)
{ {
const int bp1 = atom->nimpropertypes + 1; const int bp1 = atom->nimpropertypes + 1;
fc.set_ntypes(bp1,memory); fc.set_ntypes(bp1,memory);
@ -399,11 +399,11 @@ void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void ImproperHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nimproper, void ImproperHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nimproper,
Memory *memory) { Memory *memory) {
if (nimproper != _nimpropertypes) { if (nimproper != _nimpropertypes) {
if (_nimpropertypes > 0) if (_nimpropertypes > 0)
_memory->destroy(fc); _memory->destroy(fc);
if (nimproper > 0) if (nimproper > 0)
_memory->create(fc,nimproper,"improperharmonicintel.fc"); _memory->create(fc,nimproper,"improperharmonicintel.fc");
} }

View File

@ -45,8 +45,8 @@ class ImproperHarmonicIntel : public ImproperHarmonic {
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers); IntelBuffers<flt_t, acc_t> *buffers);

View File

@ -71,8 +71,8 @@ void IntelBuffers<flt_t, acc_t>::free_buffers()
if (ev_global != 0) { if (ev_global != 0) {
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(x:alloc_if(0) free_if(1)) \ nocopy(x:alloc_if(0) free_if(1)) \
nocopy(f_start:alloc_if(0) free_if(1)) \ nocopy(f_start:alloc_if(0) free_if(1)) \
nocopy(ev_global:alloc_if(0) free_if(1)) nocopy(ev_global:alloc_if(0) free_if(1))
} }
if (q != 0) { if (q != 0) {
@ -105,8 +105,8 @@ void IntelBuffers<flt_t, acc_t>::free_buffers()
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal, void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
const int nthreads, const int nthreads,
const int offload_end) const int offload_end)
{ {
free_buffers(); free_buffers();
_buf_size = static_cast<double>(nall) * 1.1 + 1; _buf_size = static_cast<double>(nall) * 1.1 + 1;
@ -151,15 +151,15 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) { if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) {
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \ nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \
nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\ nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
nocopy(ev_global:length(8) alloc_if(1) free_if(0)) nocopy(ev_global:length(8) alloc_if(1) free_if(0))
} }
} else { } else {
if (x != NULL && f_start != NULL && ev_global != NULL) { if (x != NULL && f_start != NULL && ev_global != NULL) {
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \ nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \
nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\ nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
nocopy(ev_global:length(8) alloc_if(1) free_if(0)) nocopy(ev_global:length(8) alloc_if(1) free_if(0))
} }
} }
if (lmp->atom->ellipsoid != NULL) { if (lmp->atom->ellipsoid != NULL) {
@ -186,7 +186,7 @@ void IntelBuffers<flt_t, acc_t>::free_nmax()
if (tag != 0 && special != 0 && nspecial !=0) { if (tag != 0 && special != 0 && nspecial !=0) {
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(tag:alloc_if(0) free_if(1)) \ nocopy(tag:alloc_if(0) free_if(1)) \
nocopy(special,nspecial:alloc_if(0) free_if(1)) nocopy(special,nspecial:alloc_if(0) free_if(1))
} }
_off_map_nmax = 0; _off_map_nmax = 0;
_host_nmax = 0; _host_nmax = 0;
@ -261,7 +261,7 @@ void IntelBuffers<flt_t, acc_t>::free_list_local()
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list, void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list,
const int offload_end) const int offload_end)
{ {
free_list_local(); free_list_local();
int size = list->get_maxlocal(); int size = list->get_maxlocal();
@ -276,7 +276,7 @@ void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list,
if (cnumneigh != 0) { if (cnumneigh != 0) {
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(ilist:length(size) alloc_if(1) free_if(0)) \ nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \ nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
nocopy(cnumneigh:length(size) alloc_if(1) free_if(0)) nocopy(cnumneigh:length(size) alloc_if(1) free_if(0))
} }
_off_map_ilist = ilist; _off_map_ilist = ilist;
@ -309,14 +309,14 @@ void IntelBuffers<flt_t, acc_t>::free_nbor_list()
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list, void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
const int nlocal, const int nlocal,
const int nthreads, const int nthreads,
const int offload_end, const int offload_end,
const int pack_width) const int pack_width)
{ {
free_nbor_list(); free_nbor_list();
_list_alloc_atoms = 1.10 * nlocal; _list_alloc_atoms = 1.10 * nlocal;
int nt = MAX(nthreads, _off_threads); int nt = MAX(nthreads, _off_threads);
int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) * int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) *
get_max_nbors(); get_max_nbors();
lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc"); lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
@ -380,8 +380,8 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag, void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
const int nthreads, const int nthreads,
const int width) const int width)
{ {
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (_ccachex && off_flag && _off_ccache == 0) if (_ccachex && off_flag && _off_ccache == 0)
@ -418,7 +418,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
int *ccachej = _ccachej; int *ccachej = _ccachej;
if (ccachex != NULL && ccachey !=NULL && ccachez != NULL && if (ccachex != NULL && ccachey !=NULL && ccachez != NULL &&
ccachew != NULL && ccachei != NULL && ccachej !=NULL) { ccachew != NULL && ccachei != NULL && ccachej !=NULL) {
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
@ -471,7 +471,7 @@ void IntelBuffers<flt_t, acc_t>::free_ncache()
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag, void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
const int nthreads) const int nthreads)
{ {
const int nsize = get_max_nbors() * 3; const int nsize = get_max_nbors() * 3;
int esize = MIN(sizeof(int), sizeof(flt_t)); int esize = MIN(sizeof(int), sizeof(flt_t));
@ -507,7 +507,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
int *ncachejtype = _ncachejtype; int *ncachejtype = _ncachejtype;
if (ncachex != NULL && ncachey !=NULL && ncachez != NULL && if (ncachex != NULL && ncachey !=NULL && ncachez != NULL &&
ncachej != NULL && ncachejtype != NULL) { ncachej != NULL && ncachejtype != NULL) {
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \
nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \
@ -522,9 +522,9 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
#ifndef _LMP_INTEL_OFFLOAD #ifndef _LMP_INTEL_OFFLOAD
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt, void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt,
const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
{ {
IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0, IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0,
ov1, ov2, ov3, ov4, ov5); ov1, ov2, ov3, ov4, ov5);
@ -535,13 +535,13 @@ void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt,
#ifndef _LMP_INTEL_OFFLOAD #ifndef _LMP_INTEL_OFFLOAD
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall, void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall,
const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
{ {
int iifrom, iito, tid; int iifrom, iito, tid;
IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2, IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2,
ov0, ov1, ov2, ov3, ov4, ov5); ov0, ov1, ov2, ov3, ov4, ov5);
} }
#endif #endif

View File

@ -62,7 +62,7 @@ class IntelBuffers {
void free_buffers(); void free_buffers();
void free_nmax(); void free_nmax();
inline void set_bininfo(int *atombin, int *binpacked) inline void set_bininfo(int *atombin, int *binpacked)
{ _atombin = atombin; _binpacked = binpacked; } { _atombin = atombin; _binpacked = binpacked; }
inline void grow(const int nall, const int nlocal, const int nthreads, inline void grow(const int nall, const int nlocal, const int nthreads,
const int offload_end) { const int offload_end) {
@ -126,7 +126,7 @@ class IntelBuffers {
inline void grow_nbor_list(NeighList *list, const int nlocal, inline void grow_nbor_list(NeighList *list, const int nlocal,
const int nthreads, const int offload_end, const int nthreads, const int offload_end,
const int pack_width) { const int pack_width) {
if (nlocal > _list_alloc_atoms) if (nlocal > _list_alloc_atoms)
_grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width); _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
} }
@ -165,7 +165,7 @@ class IntelBuffers {
inline int get_off_threads() { return _off_threads; } inline int get_off_threads() { return _off_threads; }
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
inline void set_off_params(const int n, const int cop, inline void set_off_params(const int n, const int cop,
const int separate_buffers) const int separate_buffers)
{ _off_threads = n; _cop = cop; _separate_buffers = separate_buffers; } { _off_threads = n; _cop = cop; _separate_buffers = separate_buffers; }
inline vec3_acc_t * get_off_f() { return _off_f; } inline vec3_acc_t * get_off_f() { return _off_f; }
#endif #endif
@ -191,17 +191,17 @@ class IntelBuffers {
} }
#ifndef _LMP_INTEL_OFFLOAD #ifndef _LMP_INTEL_OFFLOAD
void fdotr_reduce_l5(const int lf, const int lt, const int nthreads, void fdotr_reduce_l5(const int lf, const int lt, const int nthreads,
const int f_stride, acc_t &ov0, acc_t &ov1, const int f_stride, acc_t &ov0, acc_t &ov1,
acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5); acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5);
void fdotr_reduce(const int nall, const int nthreads, const int f_stride, void fdotr_reduce(const int nall, const int nthreads, const int f_stride,
acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3, acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3,
acc_t &ov4, acc_t &ov5); acc_t &ov4, acc_t &ov5);
#endif #endif
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
inline void thr_pack_cop(const int ifrom, const int ito, inline void thr_pack_cop(const int ifrom, const int ito,
const int offset, const bool dotype = false) { const int offset, const bool dotype = false) {
double ** x = lmp->atom->x + offset; double ** x = lmp->atom->x + offset;
if (dotype == false) { if (dotype == false) {
#pragma vector nontemporal #pragma vector nontemporal
@ -214,16 +214,16 @@ class IntelBuffers {
int *type = lmp->atom->type + offset; int *type = lmp->atom->type + offset;
#pragma vector nontemporal #pragma vector nontemporal
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
_x[i].x = x[i][0]; _x[i].x = x[i][0];
_x[i].y = x[i][1]; _x[i].y = x[i][1];
_x[i].z = x[i][2]; _x[i].z = x[i][2];
_x[i].w = type[i]; _x[i].w = type[i];
} }
} }
} }
inline void thr_pack_host(const int ifrom, const int ito, inline void thr_pack_host(const int ifrom, const int ito,
const int offset) { const int offset) {
double ** x = lmp->atom->x + offset; double ** x = lmp->atom->x + offset;
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
_host_x[i].x = x[i][0]; _host_x[i].x = x[i][0];
@ -233,13 +233,13 @@ class IntelBuffers {
} }
inline void pack_sep_from_single(const int host_min_local, inline void pack_sep_from_single(const int host_min_local,
const int used_local, const int used_local,
const int host_min_ghost, const int host_min_ghost,
const int used_ghost) { const int used_ghost) {
memcpy(_host_x + host_min_local, _x + host_min_local, memcpy(_host_x + host_min_local, _x + host_min_local,
used_local * sizeof(atom_t)); used_local * sizeof(atom_t));
memcpy(_host_x + host_min_local + used_local, _x + host_min_ghost, memcpy(_host_x + host_min_local + used_local, _x + host_min_ghost,
used_ghost * sizeof(atom_t)); used_ghost * sizeof(atom_t));
int nall = used_local + used_ghost + host_min_local; int nall = used_local + used_ghost + host_min_local;
_host_x[nall].x = INTEL_BIGP; _host_x[nall].x = INTEL_BIGP;
_host_x[nall].y = INTEL_BIGP; _host_x[nall].y = INTEL_BIGP;
@ -247,9 +247,9 @@ class IntelBuffers {
_host_x[nall].w = 1; _host_x[nall].w = 1;
if (lmp->atom->q != NULL) { if (lmp->atom->q != NULL) {
memcpy(_host_q + host_min_local, _q + host_min_local, memcpy(_host_q + host_min_local, _q + host_min_local,
used_local * sizeof(flt_t)); used_local * sizeof(flt_t));
memcpy(_host_q + host_min_local + used_local, _q + host_min_ghost, memcpy(_host_q + host_min_local + used_local, _q + host_min_ghost,
used_ghost * sizeof(flt_t)); used_ghost * sizeof(flt_t));
} }
} }
@ -310,7 +310,7 @@ class IntelBuffers {
_alignvar(acc_t _ev_global_host[8],64); _alignvar(acc_t _ev_global_host[8],64);
void _grow(const int nall, const int nlocal, const int nthreads, void _grow(const int nall, const int nlocal, const int nthreads,
const int offload_end); const int offload_end);
void _grow_nmax(const int offload_end); void _grow_nmax(const int offload_end);
void _grow_list_local(NeighList *list, const int offload_end); void _grow_list_local(NeighList *list, const int offload_end);
void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads, void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,

View File

@ -46,23 +46,23 @@ struct lmp_intel_an_fvec {
lmp_intel_an_fvec(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; } lmp_intel_an_fvec(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; }
lmp_intel_an_fvec& operator =(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; return *this; } lmp_intel_an_fvec& operator =(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; return *this; }
const lmp_intel_an_fvec operator +(const lmp_intel_an_fvec &b) const { const lmp_intel_an_fvec operator +(const lmp_intel_an_fvec &b) const {
lmp_intel_an_fvec ret = *this; lmp_intel_an_fvec ret = *this;
ret.data[:] += b.data[:]; ret.data[:] += b.data[:];
return ret; return ret;
} }
const lmp_intel_an_fvec operator -(const lmp_intel_an_fvec &b) const { const lmp_intel_an_fvec operator -(const lmp_intel_an_fvec &b) const {
lmp_intel_an_fvec ret = *this; lmp_intel_an_fvec ret = *this;
ret.data[:] -= b.data[:]; ret.data[:] -= b.data[:];
return ret; return ret;
} }
const lmp_intel_an_fvec operator *(const lmp_intel_an_fvec &b) const { const lmp_intel_an_fvec operator *(const lmp_intel_an_fvec &b) const {
lmp_intel_an_fvec ret = *this; lmp_intel_an_fvec ret = *this;
ret.data[:] *= b.data[:]; ret.data[:] *= b.data[:];
return ret; return ret;
} }
const lmp_intel_an_fvec operator /(const lmp_intel_an_fvec &b) const { const lmp_intel_an_fvec operator /(const lmp_intel_an_fvec &b) const {
lmp_intel_an_fvec ret = *this; lmp_intel_an_fvec ret = *this;
ret.data[:] /= b.data[:]; ret.data[:] /= b.data[:];
return ret; return ret;
} }
lmp_intel_an_fvec& operator +=(const lmp_intel_an_fvec &b) { lmp_intel_an_fvec& operator +=(const lmp_intel_an_fvec &b) {
@ -103,18 +103,18 @@ struct lmp_intel_an_ivec {
explicit lmp_intel_an_ivec(int i) { data[:] = i; } explicit lmp_intel_an_ivec(int i) { data[:] = i; }
explicit lmp_intel_an_ivec(const int * a) { data[:] = a[0:VL]; } explicit lmp_intel_an_ivec(const int * a) { data[:] = a[0:VL]; }
const lmp_intel_an_ivec operator &(const lmp_intel_an_ivec &b) { const lmp_intel_an_ivec operator &(const lmp_intel_an_ivec &b) {
lmp_intel_an_ivec ret = *this; lmp_intel_an_ivec ret = *this;
ret.data[:] &= b.data[:]; ret.data[:] &= b.data[:];
return ret; return ret;
} }
const lmp_intel_an_ivec operator |(const lmp_intel_an_ivec &b) { const lmp_intel_an_ivec operator |(const lmp_intel_an_ivec &b) {
lmp_intel_an_ivec ret = *this; lmp_intel_an_ivec ret = *this;
ret.data[:] |= b.data[:]; ret.data[:] |= b.data[:];
return ret; return ret;
} }
const lmp_intel_an_ivec operator +(const lmp_intel_an_ivec &b) { const lmp_intel_an_ivec operator +(const lmp_intel_an_ivec &b) {
lmp_intel_an_ivec ret = *this; lmp_intel_an_ivec ret = *this;
ret.data[:] += b.data[:]; ret.data[:] += b.data[:];
return ret; return ret;
} }
}; };
@ -171,13 +171,13 @@ enum CalculationMode { KNC, AVX, AVX2, SSE, NONE, AN };
// This is used in the selection logic // This is used in the selection logic
template<CalculationMode mode> template<CalculationMode mode>
struct vector_traits { struct vector_traits {
static const bool support_integer_and_gather_ops = true; static const bool support_integer_and_gather_ops = true;
}; };
template<> template<>
struct vector_traits<AVX> { struct vector_traits<AVX> {
static const bool support_integer_and_gather_ops = false; static const bool support_integer_and_gather_ops = false;
}; };
// This is the base template for all the different architectures // This is the base template for all the different architectures
@ -198,10 +198,10 @@ struct ivec32x16 {
} }
explicit ivec32x16(int i) { vec = _mm512_set1_epi32(i); } explicit ivec32x16(int i) { vec = _mm512_set1_epi32(i); }
operator __m512i() const { return vec; } operator __m512i() const { return vec; }
friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) { friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) {
return _mm512_and_epi32(a, b); return _mm512_and_epi32(a, b);
} }
friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) { friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) {
return _mm512_or_epi32(a, b); return _mm512_or_epi32(a, b);
} }
friend ivec32x16 operator +(const ivec32x16 &a, const ivec32x16 &b) { friend ivec32x16 operator +(const ivec32x16 &a, const ivec32x16 &b) {
@ -326,7 +326,7 @@ struct vector_ops<double, KNC> {
*z = gather<1>(*z, mask, idxs, &base->z); *z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w); *w = int_gather<1>(*w, mask, idxs, &base->w);
} }
static void gather_8(const ivec &idxs, const bvec &mask, const void *base, static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0); *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8);
@ -337,7 +337,7 @@ struct vector_ops<double, KNC> {
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 48); *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 48);
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 56); *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 56);
} }
static void gather_4(const ivec &idxs, const bvec &mask, const void *base, static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) { fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0); *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8);
@ -464,7 +464,7 @@ struct vector_ops<float, KNC> {
*z = gather<1>(*z, mask, idxs, &base->z); *z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w); *w = int_gather<1>(*w, mask, idxs, &base->w);
} }
static void gather_8(const ivec &idxs, const bvec &mask, const void *base, static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0); *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
@ -475,7 +475,7 @@ struct vector_ops<float, KNC> {
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24); *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28); *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
} }
static void gather_4(const ivec &idxs, const bvec &mask, const void *base, static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) { fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0); *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
@ -519,10 +519,10 @@ struct ivec32x8 {
} }
explicit ivec32x8(int i) { vec = _mm256_set1_epi32(i); } explicit ivec32x8(int i) { vec = _mm256_set1_epi32(i); }
operator __m256i() const { return vec; } operator __m256i() const { return vec; }
friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) { friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) {
return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))); return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
} }
friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) { friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) {
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))); return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
} }
friend ivec32x8 operator +(const ivec32x8 &a, const ivec32x8 &b) { friend ivec32x8 operator +(const ivec32x8 &a, const ivec32x8 &b) {
@ -545,10 +545,10 @@ struct avx_bvec {
operator F64vec4() const { return _mm256_castsi256_pd(vec); } operator F64vec4() const { return _mm256_castsi256_pd(vec); }
operator F32vec8() const { return _mm256_castsi256_ps(vec); } operator F32vec8() const { return _mm256_castsi256_ps(vec); }
operator ivec32x8() const { return vec; } operator ivec32x8() const { return vec; }
friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) { friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) {
return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))); return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
} }
friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) { friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) {
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))); return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
} }
friend avx_bvec operator ~(const avx_bvec &a) { return _mm256_castpd_si256(_mm256_andnot_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(avx_bvec(0xFFFFFFFF)))); } friend avx_bvec operator ~(const avx_bvec &a) { return _mm256_castpd_si256(_mm256_andnot_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(avx_bvec(0xFFFFFFFF)))); }
@ -582,8 +582,8 @@ struct vector_ops<double, AVX> {
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx); _mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
_mm256_store_pd(reinterpret_cast<double*>(src), from); _mm256_store_pd(reinterpret_cast<double*>(src), from);
for (int i = 0; i < VL; i++) { for (int i = 0; i < VL; i++) {
result[i] = mask_test_at(mask, i) result[i] = mask_test_at(mask, i)
? *reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i]) ? *reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i])
: src[i]; : src[i];
} }
return _mm256_load_pd(reinterpret_cast<double*>(result)); return _mm256_load_pd(reinterpret_cast<double*>(result));
@ -605,18 +605,18 @@ struct vector_ops<double, AVX> {
__m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20); __m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
__m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31); __m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
__m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31); __m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
*x = blend(mask, *x, c0); *x = blend(mask, *x, c0);
*y = blend(mask, *y, c1); *y = blend(mask, *y, c1);
*z = blend(mask, *z, c2); *z = blend(mask, *z, c2);
*w = int_blend(mask, *w, _mm256_castps_si256(_mm256_permute_ps(_mm256_castpd_ps(c3), 0xA0))); *w = int_blend(mask, *w, _mm256_castps_si256(_mm256_permute_ps(_mm256_castpd_ps(c3), 0xA0)));
} }
static void gather_8(const ivec &idxs, const bvec &mask, const void *base, static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero(); fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
} }
static void gather_4(const ivec &idxs, const bvec &mask, const void *base, static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) { fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
iarr i, m; iarr i, m;
_mm256_store_si256(reinterpret_cast<__m256i*>(i), idxs); _mm256_store_si256(reinterpret_cast<__m256i*>(i), idxs);
@ -642,10 +642,10 @@ struct vector_ops<double, AVX> {
__m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20); __m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
__m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31); __m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
__m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31); __m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
*r0 = blend(mask, *r0, c0); *r0 = blend(mask, *r0, c0);
*r1 = blend(mask, *r1, c1); *r1 = blend(mask, *r1, c1);
*r2 = blend(mask, *r2, c2); *r2 = blend(mask, *r2, c2);
*r3 = blend(mask, *r3, c3); *r3 = blend(mask, *r3, c3);
} }
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) { static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & mask) | (a & ~ mask); return (b & mask) | (a & ~ mask);
@ -809,8 +809,8 @@ struct vector_ops<float, AVX> {
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx); _mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
_mm256_store_ps(reinterpret_cast<float*>(src), from); _mm256_store_ps(reinterpret_cast<float*>(src), from);
for (int i = 0; i < VL; i++) { for (int i = 0; i < VL; i++) {
result[i] = mask_test_at(mask, i) result[i] = mask_test_at(mask, i)
? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i]) ? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
: src[i]; : src[i];
} }
return _mm256_load_ps(reinterpret_cast<float*>(result)); return _mm256_load_ps(reinterpret_cast<float*>(result));
@ -842,18 +842,18 @@ struct vector_ops<float, AVX> {
__m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE); __m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
__m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44); __m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
__m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE); __m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
*x = blend(mask, *x, c0); *x = blend(mask, *x, c0);
*y = blend(mask, *y, c1); *y = blend(mask, *y, c1);
*z = blend(mask, *z, c2); *z = blend(mask, *z, c2);
*w = int_blend(mask, *w, _mm256_castps_si256(c3)); *w = int_blend(mask, *w, _mm256_castps_si256(c3));
} }
static void gather_8(const ivec &idxs, const bvec &mask, const void *base, static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero(); fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7);
} }
static void gather_4(const ivec &idxs, const bvec &mask, const void *base, static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) { fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
iarr i, m; iarr i, m;
int_store(i, idxs); int_store(i, idxs);
@ -880,10 +880,10 @@ struct vector_ops<float, AVX> {
__m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE); __m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
__m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44); __m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
__m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE); __m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
*r0 = blend(mask, *r0, c0); *r0 = blend(mask, *r0, c0);
*r1 = blend(mask, *r1, c1); *r1 = blend(mask, *r1, c1);
*r2 = blend(mask, *r2, c2); *r2 = blend(mask, *r2, c2);
*r3 = blend(mask, *r3, c3); *r3 = blend(mask, *r3, c3);
} }
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) { static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & mask) | (a & ~ mask); return (b & mask) | (a & ~ mask);
@ -961,8 +961,8 @@ struct vector_ops<float, AVX> {
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx); _mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
_mm256_store_si256(reinterpret_cast<__m256i*>(src), from); _mm256_store_si256(reinterpret_cast<__m256i*>(src), from);
for (int i = 0; i < VL; i++) { for (int i = 0; i < VL; i++) {
result[i] = mask_test_at(mask, i) result[i] = mask_test_at(mask, i)
? *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[i]) ? *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
: src[i]; : src[i];
} }
return _mm256_load_si256(reinterpret_cast<__m256i*>(result)); return _mm256_load_si256(reinterpret_cast<__m256i*>(result));
@ -1038,10 +1038,10 @@ struct avx2_ivec32 {
} }
// Explicit construction from a scalar: vec receives _mm256_set1_epi32(i).
explicit avx2_ivec32(int i) { vec = _mm256_set1_epi32(i); } explicit avx2_ivec32(int i) { vec = _mm256_set1_epi32(i); }
// Implicit conversion that hands back the wrapped __m256i unchanged.
operator __m256i() const { return vec; } operator __m256i() const { return vec; }
// Bitwise AND of two integer vectors via _mm256_and_si256.
friend avx2_ivec32 operator &(const avx2_ivec32 &a, const avx2_ivec32 &b) {
  const __m256i lhs = a;
  const __m256i rhs = b;
  return _mm256_and_si256(lhs, rhs);
}
// Bitwise OR of two integer vectors via _mm256_or_si256.
friend avx2_ivec32 operator |(const avx2_ivec32 &a, const avx2_ivec32 &b) {
  const __m256i lhs = a;
  const __m256i rhs = b;
  return _mm256_or_si256(lhs, rhs);
}
friend avx2_ivec32 operator +(const avx2_ivec32 &a, const avx2_ivec32 &b) { friend avx2_ivec32 operator +(const avx2_ivec32 &a, const avx2_ivec32 &b) {
@ -1060,14 +1060,14 @@ struct avx2_bvec {
// Reinterprets the mask bits as packed doubles (a cast, no value conversion).
operator F64vec4() const { return _mm256_castsi256_pd(vec); } operator F64vec4() const { return _mm256_castsi256_pd(vec); }
// Reinterprets the mask bits as packed floats (a cast, no value conversion).
operator F32vec8() const { return _mm256_castsi256_ps(vec); } operator F32vec8() const { return _mm256_castsi256_ps(vec); }
// Converts the mask to an avx2_ivec32 built from the same underlying vec.
operator avx2_ivec32() const { return vec; } operator avx2_ivec32() const { return vec; }
// Bitwise AND of two masks via _mm256_and_si256.
friend avx2_bvec operator &(const avx2_bvec &a, const avx2_bvec &b) {
  const __m256i lhs = a;
  const __m256i rhs = b;
  return _mm256_and_si256(lhs, rhs);
}
// Bitwise OR of two masks via _mm256_or_si256.
friend avx2_bvec operator |(const avx2_bvec &a, const avx2_bvec &b) {
  const __m256i lhs = a;
  const __m256i rhs = b;
  return _mm256_or_si256(lhs, rhs);
}
// Complement of a mask: and-not of a against avx2_bvec(0xFFFFFFFF).
// NOTE(review): relies on avx2_bvec(0xFFFFFFFF) having every bit set -- confirm
// against the constructor, which is outside this view.
friend avx2_bvec operator ~(const avx2_bvec &a) {
  const avx2_bvec all_set(0xFFFFFFFF);
  return _mm256_andnot_si256(a, all_set);
}
// In-place AND: replaces this mask with (vec & a) and returns *this.
avx2_bvec& operator &=(const avx2_bvec &a) {
  *this = _mm256_and_si256(vec, a);
  return *this;
}
}; };
@ -1106,13 +1106,13 @@ struct vector_ops<double, AVX2> {
*z = _mm256_mask_i32gather_pd(*z, &base->z, _mm256_castsi256_si128(idx1), mask, 1); *z = _mm256_mask_i32gather_pd(*z, &base->z, _mm256_castsi256_si128(idx1), mask, 1);
*w = _mm256_mask_i32gather_epi32(*w, &base->w, idx, mask, 1); *w = _mm256_mask_i32gather_epi32(*w, &base->w, idx, mask, 1);
} }
static void gather_8(const ivec &idxs, const bvec &mask, const void *base, static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero(); fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
} }
static void gather_4(const ivec &idx, const bvec &mask, const void *base, static void gather_4(const ivec &idx, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) { fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120 ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8); ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
@ -1253,7 +1253,7 @@ struct vector_ops<float, AVX2> {
*z = _mm256_mask_i32gather_ps(*z, reinterpret_cast<const float*>(base) + 2, idx, mask, 1); *z = _mm256_mask_i32gather_ps(*z, reinterpret_cast<const float*>(base) + 2, idx, mask, 1);
*w = _mm256_mask_i32gather_epi32(*w, reinterpret_cast<const int*>(base) + 3, idx, mask, 1); *w = _mm256_mask_i32gather_epi32(*w, reinterpret_cast<const int*>(base) + 3, idx, mask, 1);
} }
static void gather_8(const ivec &idxs, const bvec &mask, const void *base, static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0); *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
@ -1264,7 +1264,7 @@ struct vector_ops<float, AVX2> {
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24); *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28); *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
} }
static void gather_4(const ivec &idxs, const bvec &mask, const void *base, static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) { fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0); *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
@ -1401,10 +1401,10 @@ struct ivec32x4 {
} }
// Explicit construction from a scalar: vec receives _mm_set1_epi32(i).
explicit ivec32x4(int i) { vec = _mm_set1_epi32(i); } explicit ivec32x4(int i) { vec = _mm_set1_epi32(i); }
// Implicit conversion that hands back the wrapped __m128i unchanged.
operator __m128i() const { return vec; } operator __m128i() const { return vec; }
// Bitwise AND of two integer vectors. The operands are reinterpreted as packed
// doubles for _mm_and_pd and the result is cast back; the casts move no data.
friend ivec32x4 operator &(const ivec32x4 &a, const ivec32x4 &b) {
  const __m128d lhs = _mm_castsi128_pd(a);
  const __m128d rhs = _mm_castsi128_pd(b);
  return _mm_castpd_si128(_mm_and_pd(lhs, rhs));
}
// Bitwise OR of two integer vectors. The operands are reinterpreted as packed
// doubles for _mm_or_pd and the result is cast back; the casts move no data.
friend ivec32x4 operator |(const ivec32x4 &a, const ivec32x4 &b) {
  const __m128d lhs = _mm_castsi128_pd(a);
  const __m128d rhs = _mm_castsi128_pd(b);
  return _mm_castpd_si128(_mm_or_pd(lhs, rhs));
}
friend ivec32x4 operator +(const ivec32x4 &a, const ivec32x4 &b) { friend ivec32x4 operator +(const ivec32x4 &a, const ivec32x4 &b) {
@ -1420,10 +1420,10 @@ struct sse_bvecx4 {
// Implicit conversion that hands back the wrapped __m128i unchanged.
operator __m128i() const { return vec; } operator __m128i() const { return vec; }
// Reinterprets the mask bits as packed doubles (a cast, no value conversion).
operator F64vec2() const { return _mm_castsi128_pd(vec); } operator F64vec2() const { return _mm_castsi128_pd(vec); }
// Converts the mask to an ivec32x4 built from the same underlying vec.
operator ivec32x4() const { return vec; } operator ivec32x4() const { return vec; }
// Bitwise AND of two masks. The operands are reinterpreted as packed doubles
// for _mm_and_pd and the result is cast back; the casts move no data.
friend sse_bvecx4 operator &(const sse_bvecx4 &a, const sse_bvecx4 &b) {
  const __m128d lhs = _mm_castsi128_pd(a);
  const __m128d rhs = _mm_castsi128_pd(b);
  return _mm_castpd_si128(_mm_and_pd(lhs, rhs));
}
// Bitwise OR of two masks. The operands are reinterpreted as packed doubles
// for _mm_or_pd and the result is cast back; the casts move no data.
friend sse_bvecx4 operator |(const sse_bvecx4 &a, const sse_bvecx4 &b) {
  const __m128d lhs = _mm_castsi128_pd(a);
  const __m128d rhs = _mm_castsi128_pd(b);
  return _mm_castpd_si128(_mm_or_pd(lhs, rhs));
}
// Complement of a mask: and-not of a against sse_bvecx4(0xFFFFFFFF).
// NOTE(review): relies on sse_bvecx4(0xFFFFFFFF) having every bit set -- confirm
// against the constructor, which is outside this view.
friend sse_bvecx4 operator ~(const sse_bvecx4 &a) {
  const __m128d all_set = _mm_castsi128_pd(sse_bvecx4(0xFFFFFFFF));
  const __m128d value = _mm_castsi128_pd(a);
  return _mm_castpd_si128(_mm_andnot_pd(value, all_set));
}
@ -1477,18 +1477,18 @@ struct vector_ops<double, SSE> {
__m128d c1 = _mm_unpackhi_pd(a0lo, a1lo); __m128d c1 = _mm_unpackhi_pd(a0lo, a1lo);
__m128d c2 = _mm_unpacklo_pd(a0hi, a1hi); __m128d c2 = _mm_unpacklo_pd(a0hi, a1hi);
__m128d c3 = _mm_unpackhi_pd(a0hi, a1hi); __m128d c3 = _mm_unpackhi_pd(a0hi, a1hi);
*x = blend(mask, *x, c0); *x = blend(mask, *x, c0);
*y = blend(mask, *y, c1); *y = blend(mask, *y, c1);
*z = blend(mask, *z, c2); *z = blend(mask, *z, c2);
*w = int_blend(mask, *w, _mm_shuffle_epi32(_mm_castpd_si128(c3), 0xA0)); *w = int_blend(mask, *w, _mm_shuffle_epi32(_mm_castpd_si128(c3), 0xA0));
} }
static void gather_8(const ivec &idxs, const bvec &mask, const void *base, static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero(); fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
} }
static void gather_4(const ivec &idxs, const bvec &mask, const void *base, static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) { fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0); *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 8); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 8);
@ -1634,8 +1634,8 @@ struct vector_ops<float, SSE> {
_mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx); _mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx);
_mm_store_ps(reinterpret_cast<float*>(src), from); _mm_store_ps(reinterpret_cast<float*>(src), from);
for (int i = 0; i < VL; i++) { for (int i = 0; i < VL; i++) {
result[i] = m[i] result[i] = m[i]
? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i]) ? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
: src[i]; : src[i];
} }
return _mm_load_ps(reinterpret_cast<float*>(result)); return _mm_load_ps(reinterpret_cast<float*>(result));
@ -1647,13 +1647,13 @@ struct vector_ops<float, SSE> {
*z = gather<1>(*z, mask, idxs, &base->z); *z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w); *w = int_gather<1>(*w, mask, idxs, &base->w);
} }
static void gather_8(const ivec &idxs, const bvec &mask, const void *base, static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero(); fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7);
} }
static void gather_4(const ivec &idxs, const bvec &mask, const void *base, static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) { fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0); *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 4); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 4);
@ -1816,13 +1816,13 @@ struct vector_ops<flt_t, NONE> {
*z = gather<1>(*z, mask, idxs, &base->z); *z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w); *w = int_gather<1>(*w, mask, idxs, &base->w);
} }
static void gather_8(const ivec &idxs, const bvec &mask, const void *base, static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero(); fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7);
} }
static void gather_4(const ivec &idxs, const bvec &mask, const void *base, static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) { fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal)); *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal));
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal)); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal));
@ -1946,13 +1946,13 @@ struct vector_ops<flt_t, AN> {
*z = gather<1>(*z, mask, idxs, &base->z); *z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w); *w = int_gather<1>(*w, mask, idxs, &base->w);
} }
static void gather_8(const ivec &idxs, const bvec &mask, const void *base, static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero(); fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7);
} }
static void gather_4(const ivec &idxs, const bvec &mask, const void *base, static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) { fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal)); *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal));
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal)); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal));
@ -2113,7 +2113,7 @@ struct AccumulatorTwiceMixin {
typedef avec_t avec; typedef avec_t avec;
typedef typename HIGH::fscal aarr[BASE::VL] __attribute__((aligned(BASE::ALIGN))); typedef typename HIGH::fscal aarr[BASE::VL] __attribute__((aligned(BASE::ALIGN)));
static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) { static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
typename HIGH::fvec blo = BASE::cvtup_lo(b); typename HIGH::fvec blo = BASE::cvtup_lo(b);
typename HIGH::fvec bhi = BASE::cvtup_hi(b); typename HIGH::fvec bhi = BASE::cvtup_hi(b);
@ -2121,7 +2121,7 @@ struct AccumulatorTwiceMixin {
BASE::mask_cvtup(m, &mlo, &mhi); BASE::mask_cvtup(m, &mlo, &mhi);
return avec(HIGH::mask_add(src.lo, mlo, a.lo, blo), HIGH::mask_add(src.hi, mhi, a.hi, bhi)); return avec(HIGH::mask_add(src.lo, mlo, a.lo, blo), HIGH::mask_add(src.hi, mhi, a.hi, bhi));
} }
static typename HIGH::fscal acc_reduce_add(const avec &a) { static typename HIGH::fscal acc_reduce_add(const avec &a) {
return HIGH::reduce_add(a.lo + a.hi); return HIGH::reduce_add(a.lo + a.hi);
} }
@ -2143,13 +2143,13 @@ template<class BASE_flt_t, class HIGH_flt_t, CalculationMode mic>
struct AccumulatorTwiceMixinNone { struct AccumulatorTwiceMixinNone {
typedef vector_ops<BASE_flt_t, mic> BASE; typedef vector_ops<BASE_flt_t, mic> BASE;
typedef vector_ops<HIGH_flt_t, mic> HIGH; typedef vector_ops<HIGH_flt_t, mic> HIGH;
typedef typename HIGH::fvec avec; typedef typename HIGH::fvec avec;
typedef typename HIGH::fscal aarr[BASE::VL]; typedef typename HIGH::fscal aarr[BASE::VL];
// Masked accumulate: converts b to the HIGH vector type with static_cast and
// forwards src, m, a and the converted value to HIGH::mask_add.
static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
  const typename HIGH::fvec widened = static_cast<typename HIGH::fvec>(b);
  return HIGH::mask_add(src, m, a, widened);
}
// Horizontal sum of the accumulator, delegated to HIGH::reduce_add.
static typename HIGH::fscal acc_reduce_add(const avec &a) {
  typedef typename HIGH::fscal result_t;
  const result_t total = HIGH::reduce_add(a);
  return total;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -18,110 +18,110 @@
#ifndef LMP_MATH_EXTRA_INTEL_H #ifndef LMP_MATH_EXTRA_INTEL_H
#define LMP_MATH_EXTRA_INTEL_H #define LMP_MATH_EXTRA_INTEL_H
/* ----------------------------------------------------------------------
   expand a quaternion into nine scalars mat_0 .. mat_8
   quat: any expression with members w, i, j, k (each read once)
   mat:  name prefix; mat_0 .. mat_8 must be assignable
   flt_t must name a type at the point of use
------------------------------------------------------------------------- */

#define ME_quat_to_mat_trans(quat, mat)                 \
{                                                       \
  /* parenthesised so that e.g. *ptr works as quat */   \
  flt_t quat_w = (quat).w;                              \
  flt_t quat_i = (quat).i;                              \
  flt_t quat_j = (quat).j;                              \
  flt_t quat_k = (quat).k;                              \
  flt_t w2 = quat_w * quat_w;                           \
  flt_t i2 = quat_i * quat_i;                           \
  flt_t j2 = quat_j * quat_j;                           \
  flt_t k2 = quat_k * quat_k;                           \
  flt_t twoij = (flt_t)2.0 * quat_i * quat_j;           \
  flt_t twoik = (flt_t)2.0 * quat_i * quat_k;           \
  flt_t twojk = (flt_t)2.0 * quat_j * quat_k;           \
  flt_t twoiw = (flt_t)2.0 * quat_i * quat_w;           \
  flt_t twojw = (flt_t)2.0 * quat_j * quat_w;           \
  flt_t twokw = (flt_t)2.0 * quat_k * quat_w;           \
                                                        \
  mat##_0 = w2 + i2 - j2 - k2;                          \
  mat##_3 = twoij - twokw;                              \
  mat##_6 = twojw + twoik;                              \
                                                        \
  mat##_1 = twoij + twokw;                              \
  mat##_4 = w2 - i2 + j2 - k2;                          \
  mat##_7 = twojk - twoiw;                              \
                                                        \
  mat##_2 = twoik - twojw;                              \
  mat##_5 = twojk + twoiw;                              \
  mat##_8 = w2 - i2 - j2 + k2;                          \
}
/* ----------------------------------------------------------------------
   diagonal matrix times a full matrix
   d:   expression indexable as d[0..2] (the diagonal)
   m:   name prefix of the input scalars m_0 .. m_8
   ans: name prefix of the outputs; ans_k = d[k/3] * m_k
------------------------------------------------------------------------- */

#define ME_diag_times3(d, m, ans)                       \
{                                                       \
  /* (d) is parenthesised so pointer arithmetic such */ \
  /* as p + 1 can be passed as the diagonal          */ \
  ans##_0 = (d)[0] * m##_0;                             \
  ans##_1 = (d)[0] * m##_1;                             \
  ans##_2 = (d)[0] * m##_2;                             \
  ans##_3 = (d)[1] * m##_3;                             \
  ans##_4 = (d)[1] * m##_4;                             \
  ans##_5 = (d)[1] * m##_5;                             \
  ans##_6 = (d)[2] * m##_6;                             \
  ans##_7 = (d)[2] * m##_7;                             \
  ans##_8 = (d)[2] * m##_8;                             \
}
/* same product as a diagonal-times-matrix, but the diagonal is supplied as
   three scalars d_0, d_1, d_2 instead of an indexable expression;
   ans_k = d_(k/3) * m_k, assigned in the order ans_0 .. ans_8 */
#define ME_diag_times3a(d, m, ans)                                            \
{                                                                             \
  ans##_0 = d##_0 * m##_0; ans##_1 = d##_0 * m##_1; ans##_2 = d##_0 * m##_2;  \
  ans##_3 = d##_1 * m##_3; ans##_4 = d##_1 * m##_4; ans##_5 = d##_1 * m##_5;  \
  ans##_6 = d##_2 * m##_6; ans##_7 = d##_2 * m##_7; ans##_8 = d##_2 * m##_8;  \
}
/* ----------------------------------------------------------------------
   multiply the transpose of mat1 times mat2
   m1, m2, ans are name prefixes for nine scalars each (_0 .. _8);
   outputs are assigned in the order ans_0 .. ans_8
------------------------------------------------------------------------- */

#define ME_transpose_times3(m1, m2, ans)                        \
{                                                               \
  /* outputs 0..2: built from m1_0, m1_3, m1_6 */               \
  ans##_0 = m1##_0*m2##_0 + m1##_3*m2##_3 + m1##_6*m2##_6;      \
  ans##_1 = m1##_0*m2##_1 + m1##_3*m2##_4 + m1##_6*m2##_7;      \
  ans##_2 = m1##_0*m2##_2 + m1##_3*m2##_5 + m1##_6*m2##_8;      \
  /* outputs 3..5: built from m1_1, m1_4, m1_7 */               \
  ans##_3 = m1##_1*m2##_0 + m1##_4*m2##_3 + m1##_7*m2##_6;      \
  ans##_4 = m1##_1*m2##_1 + m1##_4*m2##_4 + m1##_7*m2##_7;      \
  ans##_5 = m1##_1*m2##_2 + m1##_4*m2##_5 + m1##_7*m2##_8;      \
  /* outputs 6..8: built from m1_2, m1_5, m1_8 */               \
  ans##_6 = m1##_2*m2##_0 + m1##_5*m2##_3 + m1##_8*m2##_6;      \
  ans##_7 = m1##_2*m2##_1 + m1##_5*m2##_4 + m1##_8*m2##_7;      \
  ans##_8 = m1##_2*m2##_2 + m1##_5*m2##_5 + m1##_8*m2##_8;      \
}
/* ----------------------------------------------------------------------
   normalize a vector, return in ans
   v0, v1, v2: component expressions (each is evaluated more than once)
   ans:        name prefix; ans_0 .. ans_2 receive the scaled components
   flt_t and sqrt must be available at the point of use; a zero-length
   input is not guarded against
------------------------------------------------------------------------- */

#define ME_normalize3(v0, v1, v2, ans)                                  \
{                                                                       \
  /* arguments are parenthesised so compound expressions such as */     \
  /* a+b are squared and scaled as a whole                       */     \
  flt_t scale = (flt_t)1.0 / sqrt((v0)*(v0)+(v1)*(v1)+(v2)*(v2));       \
  ans##_0 = (v0) * scale;                                               \
  ans##_1 = (v1) * scale;                                               \
  ans##_2 = (v2) * scale;                                               \
}
/* ----------------------------------------------------------------------
   add two matrices
   m1, m2, ans are name prefixes for nine scalars each (_0 .. _8);
   ans_k = m1_k + m2_k
------------------------------------------------------------------------- */

#define ME_plus3(m1, m2, ans)                                                      \
{                                                                                  \
  ans##_0 = m1##_0 + m2##_0; ans##_1 = m1##_1 + m2##_1; ans##_2 = m1##_2 + m2##_2; \
  ans##_3 = m1##_3 + m2##_3; ans##_4 = m1##_4 + m2##_4; ans##_5 = m1##_5 + m2##_5; \
  ans##_6 = m1##_6 + m2##_6; ans##_7 = m1##_7 + m2##_7; ans##_8 = m1##_8 + m2##_8; \
}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
@ -135,7 +135,7 @@
determinant of a matrix determinant of a matrix
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
/* expression macro: six signed triple products of m_0 .. m_8, wrapped in
   parentheses so the result can be embedded in a larger expression */
#define ME_det3(m)                      \
  ( m##_0 * m##_4 * m##_8               \
  - m##_0 * m##_5 * m##_7               \
  - m##_3 * m##_1 * m##_8               \
  + m##_3 * m##_2 * m##_7               \
  + m##_6 * m##_1 * m##_5               \
  - m##_6 * m##_2 * m##_4 )
@ -144,8 +144,8 @@
row vector times matrix row vector times matrix
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
#define ME_vecmat(v, m, ans) \ #define ME_vecmat(v, m, ans) \
{ \ { \
ans##_0 = v##_0 * m##_0 + v##_1 * m##_3 + v##_2 * m##_6; \ ans##_0 = v##_0 * m##_0 + v##_1 * m##_3 + v##_2 * m##_6; \
ans##_1 = v##_0 * m##_1 + v##_1 * m##_4 + v##_2 * m##_7; \ ans##_1 = v##_0 * m##_1 + v##_1 * m##_4 + v##_2 * m##_7; \
ans##_2 = v##_0 * m##_2 + v##_1 * m##_5 + v##_2 * m##_8; \ ans##_2 = v##_0 * m##_2 + v##_1 * m##_5 + v##_2 * m##_8; \
@ -155,214 +155,214 @@
cross product of 2 vectors cross product of 2 vectors
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
/* v1, v2, ans are name prefixes for three scalars each (_0 .. _2);
   ans_0, ans_1, ans_2 are assigned in that order */
#define ME_cross3(v1, v2, ans)                          \
{                                                       \
  ans##_0 = v1##_1 * v2##_2 - v1##_2 * v2##_1;          \
  ans##_1 = v1##_2 * v2##_0 - v1##_0 * v2##_2;          \
  ans##_2 = v1##_0 * v2##_1 - v1##_1 * v2##_0;          \
}
/* ----------------------------------------------------------------------
   cross product of 2 vectors
   the first vector is taken from m1_0, m1_1, m1_2; the second is v2_0 .. v2_2
------------------------------------------------------------------------- */

#define ME_mv0_cross3(m1, v2, ans)                      \
{                                                       \
  ans##_0 = m1##_1 * v2##_2 - m1##_2 * v2##_1;          \
  ans##_1 = m1##_2 * v2##_0 - m1##_0 * v2##_2;          \
  ans##_2 = m1##_0 * v2##_1 - m1##_1 * v2##_0;          \
}
/* cross product whose first vector is taken from m1_3, m1_4, m1_5 and whose
   second vector is v2_0 .. v2_2; result goes to ans_0 .. ans_2 */
#define ME_mv1_cross3(m1, v2, ans)                      \
{                                                       \
  ans##_0 = m1##_4 * v2##_2 - m1##_5 * v2##_1;          \
  ans##_1 = m1##_5 * v2##_0 - m1##_3 * v2##_2;          \
  ans##_2 = m1##_3 * v2##_1 - m1##_4 * v2##_0;          \
}
/* cross product whose first vector is taken from m1_6, m1_7, m1_8 and whose
   second vector is v2_0 .. v2_2; result goes to ans_0 .. ans_2 */
#define ME_mv2_cross3(m1, v2, ans)                      \
{                                                       \
  ans##_0 = m1##_7 * v2##_2 - m1##_8 * v2##_1;          \
  ans##_1 = m1##_8 * v2##_0 - m1##_6 * v2##_2;          \
  ans##_2 = m1##_6 * v2##_1 - m1##_7 * v2##_0;          \
}
/* ----------------------------------------------------------------------
   fills ans_0 .. ans_8 from the scalars m1_0 .. m1_8, m2_0 .. m2_8 and
   s1_0 .. s1_2 (all four arguments are name prefixes)
   den is first set to the six-term 3x3 determinant expression of m1 and
   then replaced by its reciprocal; there is no guard against a zero value
   ans_0..2 are scaled by s1_0, ans_3..5 by s1_1, ans_6..8 by s1_2
   flt_t must name a type at the point of use
------------------------------------------------------------------------- */
#define ME_compute_eta_torque(m1, m2, s1, ans) \ #define ME_compute_eta_torque(m1, m2, s1, ans) \
{ \ { \
flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7- \ flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7- \
m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5- \ m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5- \
m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8; \ m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8; \
den = (flt_t)1.0 / den; \ den = (flt_t)1.0 / den; \
\ \
ans##_0 = s1##_0*(m1##_5*m1##_1*m2##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_0- \ ans##_0 = s1##_0*(m1##_5*m1##_1*m2##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_0- \
m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+ \ m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+ \
m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8- \ m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8- \
m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+ \ m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+ \
m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den; \ m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den; \
\ \
ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+ \ ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+ \
(flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5- \ (flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5- \
(flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2- \ (flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2- \
m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+ \ m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+ \
m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den; \ m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den; \
\ \
ans##_2 = s1##_0*(m1##_1*m1##_5*m2##_0-m1##_2*m2##_0*m1##_4- \ ans##_2 = s1##_0*(m1##_1*m1##_5*m2##_0-m1##_2*m2##_0*m1##_4- \
m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1- \ m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1- \
m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+ \ m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+ \
(flt_t)2.0*m1##_4*m1##_0*m2##_2- \ (flt_t)2.0*m1##_4*m1##_0*m2##_2- \
(flt_t)2.0*m1##_3*m2##_2*m1##_1+ \ (flt_t)2.0*m1##_3*m2##_2*m1##_1+ \
m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den; \ m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den; \
\ \
ans##_3 = s1##_1*(-m1##_4*m2##_5*m1##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_3+ \ ans##_3 = s1##_1*(-m1##_4*m2##_5*m1##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_3+ \
m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+ \ m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+ \
m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8- \ m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8- \
m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4- \ m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4- \
m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den; \ m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den; \
\ \
ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+ \ ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+ \
(flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5- \ (flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5- \
(flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+ \ (flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+ \
m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2- \ m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2- \
m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den; \ m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den; \
\ \
ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4- \ ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4- \
m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+ \ m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+ \
(flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+ \ (flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+ \
m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4- \ m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4- \
(flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)* \ (flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)* \
den; \ den; \
\ \
ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+ \ ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+ \
(flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+ \ (flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+ \
m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5- \ m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5- \
m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7- \ m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7- \
m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den; \ m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den; \
\ \
ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7- \ ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7- \
(flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+ \ (flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+ \
(flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8- \ (flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8- \
m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+ \ m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+ \
m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den; \ m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den; \
\ \
ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4- \ ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4- \
m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7- \ m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7- \
m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+ \ m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+ \
(flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+ \ (flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+ \
m1##_6*m1##_1*m2##_7-(flt_t)2.0*m2##_8*m1##_3*m1##_1)* \ m1##_6*m1##_1*m2##_7-(flt_t)2.0*m2##_8*m1##_3*m1##_1)* \
den; \ den; \
} }
/* ----------------------------------------------------------------------
   copy the four scalar components src_0 .. src_3 into dst_0 .. dst_3
   dst and src are identifier prefixes (pasted with ##), not expressions
   the assignments are wrapped in braces so that the macro expands to a
   single compound statement, e.g. when it is the body of an unbraced if
------------------------------------------------------------------------- */

#define ME_vcopy4(dst,src) \
{ \
  dst##_0 = src##_0; \
  dst##_1 = src##_1; \
  dst##_2 = src##_2; \
  dst##_3 = src##_3; \
}
/* ----------------------------------------------------------------------
   solve the 3x3 linear system  m1 * ans = v  by Gaussian elimination with
   row pivoting on the augmented matrix [m1 | v], then back substitution
     m1             identifier prefix of the matrix entries m1_0 .. m1_8;
                    (m1_0,m1_1,m1_2), (m1_3,m1_4,m1_5), (m1_6,m1_7,m1_8)
                    are the coefficients of the three equations
     v_0, v_1, v_2  right-hand side values
     ans            identifier prefix of the outputs ans_0, ans_1, ans_2
     error          lvalue that is set to 1 when a zero pivot is found;
                    it is never reset here, so the caller must initialize it
   the type flt_t must be visible where the macro is expanded
   the divisions are still carried out after a zero pivot has been flagged,
   so the ans_* values are not meaningful whenever error has been set
------------------------------------------------------------------------- */

#define ME_mldivide3(m1, v_0, v_1, v_2, ans, error) \
{ \
  flt_t aug_0, aug_1, aug_2, aug_3, aug_4, aug_5; \
  flt_t aug_6, aug_7, aug_8, aug_9, aug_10, aug_11, t; \
  \
  /* augmented rows: aug_0..3, aug_4..7, aug_8..11 (last entry is v) */ \
  aug_3 = v_0; \
  aug_0 = m1##_0; \
  aug_1 = m1##_1; \
  aug_2 = m1##_2; \
  aug_7 = v_1; \
  aug_4 = m1##_3; \
  aug_5 = m1##_4; \
  aug_6 = m1##_5; \
  aug_11 = v_2; \
  aug_8 = m1##_6; \
  aug_9 = m1##_7; \
  aug_10 = m1##_8; \
  \
  /* first column: move the entry of largest magnitude into row 0 */ \
  if (fabs(aug_4) > fabs(aug_0)) { \
    flt_t swapt; \
    swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \
    swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \
    swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \
    swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \
  } \
  if (fabs(aug_8) > fabs(aug_0)) { \
    flt_t swapt; \
    swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \
    swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \
    swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \
    swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \
  } \
  \
  /* first pivot must be non-zero; otherwise try the rows below it */ \
  if (aug_0 != (flt_t)0.0) { \
  } else if (aug_4 != (flt_t)0.0) { \
    flt_t swapt; \
    swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \
    swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \
    swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \
    swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \
  } else if (aug_8 != (flt_t)0.0) { \
    flt_t swapt; \
    swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \
    swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \
    swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \
    swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \
  } else \
    error = 1; \
  \
  /* eliminate the first unknown from rows 1 and 2 */ \
  t = aug_4 / aug_0; \
  aug_5 -= t * aug_1; \
  aug_6 -= t * aug_2; \
  aug_7 -= t * aug_3; \
  t = aug_8 / aug_0; \
  aug_9 -= t * aug_1; \
  aug_10 -= t * aug_2; \
  aug_11 -= t * aug_3; \
  \
  /* second column: larger of the two remaining entries goes to row 1 */ \
  if (fabs(aug_9) > fabs(aug_5)) { \
    flt_t swapt; \
    swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \
    swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \
    swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \
    swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \
  } \
  \
  /* second pivot must be non-zero as well; without the final else a */ \
  /* zero pivot only produced NaN results and left error untouched */ \
  if (aug_5 != (flt_t)0.0) { \
  } else if (aug_9 != (flt_t)0.0) { \
    flt_t swapt; \
    swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \
    swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \
    swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \
    swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \
  } else \
    error = 1; \
  \
  /* eliminate the second unknown from row 2 */ \
  t = aug_9 / aug_5; \
  aug_10 -= t * aug_6; \
  aug_11 -= t * aug_7; \
  \
  /* third pivot */ \
  if (aug_10 == (flt_t)0.0) \
    error = 1; \
  \
  /* back substitution, last unknown first */ \
  ans##_2 = aug_11/aug_10; \
  t = (flt_t)0.0; \
  t += aug_6 * ans##_2; \
  ans##_1 = (aug_7-t) / aug_5; \
  t = (flt_t)0.0; \
  t += aug_1 * ans##_1; \
  t += aug_2 * ans##_2; \
  ans##_0 = (aug_3 - t) / aug_0; \
}
/* ----------------------------------------------------------------------
   normalize a quaternion
   q is an identifier prefix: the components q_w, q_i, q_j, q_k are
   scaled in place by 1 / sqrt(q_w^2 + q_i^2 + q_j^2 + q_k^2)
   a quaternion whose components are all zero is not guarded against
------------------------------------------------------------------------- */

#define ME_qnormalize(q) \
{ \
  double norm = 1.0 / \
    sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k); \
  q##_w *= norm; \
  q##_i *= norm; \
  q##_j *= norm; \
  q##_k *= norm; \
}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
@ -373,106 +373,106 @@
and divide by principal moments and divide by principal moments
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
   compute the vector w from the vector m and the quaternion quat:
     - build the 3x3 matrix rot from the quaternion components
     - wbody = transpose(rot) * m
     - multiply wbody component-wise by moments_0, moments_1, moments_2
     - w = rot * wbody
   m, quat and w are identifier prefixes (m_0 .. m_2, quat_w/_i/_j/_k,
   w_0 .. w_2); moments_* are ordinary expressions, each evaluated once
   NOTE(review): the code multiplies by moments_*; if these are meant to be
   principal moments of inertia the callers presumably pass reciprocals -
   confirm against the call sites
------------------------------------------------------------------------- */

#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w) \
{ \
  /* read the scale factors before any other local is declared, so an */ \
  /* argument that happens to be spelled w2, i2, j2, k2, rot_0, ... */ \
  /* refers to the caller's variable and not to a local of this macro */ \
  const double mq_scale_0 = moments_0; \
  const double mq_scale_1 = moments_1; \
  const double mq_scale_2 = moments_2; \
  \
  double wbody_0, wbody_1, wbody_2; \
  double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \
  \
  double w2 = quat##_w * quat##_w; \
  double i2 = quat##_i * quat##_i; \
  double j2 = quat##_j * quat##_j; \
  double k2 = quat##_k * quat##_k; \
  double twoij = 2.0 * quat##_i * quat##_j; \
  double twoik = 2.0 * quat##_i * quat##_k; \
  double twojk = 2.0 * quat##_j * quat##_k; \
  double twoiw = 2.0 * quat##_i * quat##_w; \
  double twojw = 2.0 * quat##_j * quat##_w; \
  double twokw = 2.0 * quat##_k * quat##_w; \
  \
  /* rot_0..2, rot_3..5, rot_6..8 are the three rows of rot */ \
  rot_0 = w2 + i2 - j2 - k2; \
  rot_1 = twoij - twokw; \
  rot_2 = twojw + twoik; \
  \
  rot_3 = twoij + twokw; \
  rot_4 = w2 - i2 + j2 - k2; \
  rot_5 = twojk - twoiw; \
  \
  rot_6 = twoik - twojw; \
  rot_7 = twojk + twoiw; \
  rot_8 = w2 - i2 - j2 + k2; \
  \
  /* wbody = transpose(rot) * m */ \
  wbody_0 = rot_0*m##_0 + rot_3*m##_1 + rot_6*m##_2; \
  wbody_1 = rot_1*m##_0 + rot_4*m##_1 + rot_7*m##_2; \
  wbody_2 = rot_2*m##_0 + rot_5*m##_1 + rot_8*m##_2; \
  \
  wbody_0 *= mq_scale_0; \
  wbody_1 *= mq_scale_1; \
  wbody_2 *= mq_scale_2; \
  \
  /* w = rot * wbody */ \
  w##_0 = rot_0*wbody_0 + rot_1*wbody_1 + rot_2*wbody_2; \
  w##_1 = rot_3*wbody_0 + rot_4*wbody_1 + rot_5*wbody_2; \
  w##_2 = rot_6*wbody_0 + rot_7*wbody_1 + rot_8*wbody_2; \
}
/* ----------------------------------------------------------------------
   advance an angular momentum and a quaternion in place
     dtf       factor applied to torque when updating the angular momentum
     dtq       step used for the quaternion update
     angmomin  indexable with [0..2]; angmomin += dtf * torque
     quatin    indexable with [0..3] in the order w,i,j,k; overwritten
     torque    indexable with [0..2]; read only
     i0,i1,i2  scale factors forwarded to ME_mq_to_omega (evaluated once)
   steps:
     1. angmomin += dtf * torque
     2. omega from (angmom, quat); wq built from omega and quat
     3. qfull = normalized (quat + dtq * wq)           one full step
     4. qhalf = normalized (quat + 0.5*dtq * wq), omega and wq are
        recomputed at qhalf, qhalf += 0.5*dtq * wq, normalized again
     5. quat = normalized (2*qhalf - qfull), written back to quatin
   relies on ME_mq_to_omega and ME_qnormalize being defined where the
   macro is expanded
------------------------------------------------------------------------- */

#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2) \
{ \
  /* capture the scale factors first: ME_mq_to_omega declares locals */ \
  /* such as i2 that would otherwise shadow an argument of that name */ \
  const double rich_scale_0 = (i0); \
  const double rich_scale_1 = (i1); \
  const double rich_scale_2 = (i2); \
  \
  /* arguments are parenthesized so that expressions bind as a whole */ \
  (angmomin)[0] += (dtf) * (torque)[0]; \
  double angmom_0 = (angmomin)[0]; \
  (angmomin)[1] += (dtf) * (torque)[1]; \
  double angmom_1 = (angmomin)[1]; \
  (angmomin)[2] += (dtf) * (torque)[2]; \
  double angmom_2 = (angmomin)[2]; \
  \
  double quat_w = (quatin)[0]; \
  double quat_i = (quatin)[1]; \
  double quat_j = (quatin)[2]; \
  double quat_k = (quatin)[3]; \
  \
  double omega_0, omega_1, omega_2; \
  ME_mq_to_omega(angmom,quat,rich_scale_0,rich_scale_1,rich_scale_2,omega); \
  \
  double wq_0, wq_1, wq_2, wq_3; \
  wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k; \
  wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j; \
  wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k; \
  wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i; \
  \
  /* one full step */ \
  double qfull_w, qfull_i, qfull_j, qfull_k; \
  qfull_w = quat_w + (dtq) * wq_0; \
  qfull_i = quat_i + (dtq) * wq_1; \
  qfull_j = quat_j + (dtq) * wq_2; \
  qfull_k = quat_k + (dtq) * wq_3; \
  ME_qnormalize(qfull); \
  \
  /* first half step */ \
  double qhalf_w, qhalf_i, qhalf_j, qhalf_k; \
  qhalf_w = quat_w + 0.5*(dtq) * wq_0; \
  qhalf_i = quat_i + 0.5*(dtq) * wq_1; \
  qhalf_j = quat_j + 0.5*(dtq) * wq_2; \
  qhalf_k = quat_k + 0.5*(dtq) * wq_3; \
  ME_qnormalize(qhalf); \
  \
  /* second half step, with omega and wq re-evaluated at qhalf */ \
  ME_mq_to_omega(angmom,qhalf,rich_scale_0,rich_scale_1,rich_scale_2,omega); \
  wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k; \
  wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j; \
  wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k; \
  wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i; \
  \
  qhalf_w += 0.5*(dtq) * wq_0; \
  qhalf_i += 0.5*(dtq) * wq_1; \
  qhalf_j += 0.5*(dtq) * wq_2; \
  qhalf_k += 0.5*(dtq) * wq_3; \
  ME_qnormalize(qhalf); \
  \
  /* combine the two estimates */ \
  quat_w = 2.0*qhalf_w - qfull_w; \
  quat_i = 2.0*qhalf_i - qfull_i; \
  quat_j = 2.0*qhalf_j - qfull_j; \
  quat_k = 2.0*qhalf_k - qfull_k; \
  ME_qnormalize(quat); \
  \
  (quatin)[0] = quat_w; \
  (quatin)[1] = quat_i; \
  (quatin)[2] = quat_j; \
  (quatin)[3] = quat_k; \
}
#endif #endif

View File

@ -51,11 +51,11 @@ NBinIntel::~NBinIntel() {
const int * bins = this->bins; const int * bins = this->bins;
const int * _atombin = this->_atombin; const int * _atombin = this->_atombin;
const int * _binpacked = this->_binpacked; const int * _binpacked = this->_binpacked;
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(binhead,bins,_atombin,_binpacked:alloc_if(0) free_if(1)) nocopy(binhead,bins,_atombin,_binpacked:alloc_if(0) free_if(1))
} }
#endif #endif
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
setup for bin_atoms() setup for bin_atoms()
@ -70,8 +70,8 @@ void NBinIntel::bin_atoms_setup(int nall)
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (_offload_alloc) { if (_offload_alloc) {
const int * binhead = this->binhead; const int * binhead = this->binhead;
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(binhead:alloc_if(0) free_if(1)) nocopy(binhead:alloc_if(0) free_if(1))
} }
#endif #endif
@ -98,8 +98,8 @@ void NBinIntel::bin_atoms_setup(int nall)
const int * bins = this->bins; const int * bins = this->bins;
const int * _atombin = this->_atombin; const int * _atombin = this->_atombin;
const int * _binpacked = this->_binpacked; const int * _binpacked = this->_binpacked;
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1)) nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1))
} }
#endif #endif
memory->destroy(bins); memory->destroy(bins);
@ -157,10 +157,10 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
const flt_t dx = (INTEL_BIGP - bboxhi[0]); const flt_t dx = (INTEL_BIGP - bboxhi[0]);
const flt_t dy = (INTEL_BIGP - bboxhi[1]); const flt_t dy = (INTEL_BIGP - bboxhi[1]);
const flt_t dz = (INTEL_BIGP - bboxhi[2]); const flt_t dz = (INTEL_BIGP - bboxhi[2]);
if (dx * dx + dy * dy + dz * dz < if (dx * dx + dy * dy + dz * dz <
static_cast<flt_t>(neighbor->cutneighmaxsq)) static_cast<flt_t>(neighbor->cutneighmaxsq))
error->one(FLERR, error->one(FLERR,
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."); "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
} }
// ---------- Grow and cast/pack buffers ------------- // ---------- Grow and cast/pack buffers -------------
@ -183,7 +183,7 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
{ {
int ifrom, ito, tid; int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
sizeof(ATOM_T)); sizeof(ATOM_T));
buffers->thr_pack(ifrom, ito, 0); buffers->thr_pack(ifrom, ito, 0);
} }
_fix->stop_watch(TIME_PACK); _fix->stop_watch(TIME_PACK);

View File

@ -70,48 +70,48 @@ fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
#endif #endif
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end, buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end,
_fix->nbor_pack_width()); _fix->nbor_pack_width());
int need_ic = 0; int need_ic = 0;
if (atom->molecular) if (atom->molecular)
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
neighbor->cutneighmax); neighbor->cutneighmax);
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (_fix->three_body_neighbor()) { if (_fix->three_body_neighbor()) {
if (need_ic) { if (need_ic) {
if (offload_noghost) { if (offload_noghost) {
bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end); bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
} else { } else {
bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal); bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
} }
} else { } else {
if (offload_noghost) { if (offload_noghost) {
bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end); bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
} else { } else {
bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal); bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
} }
} }
} else { } else {
if (need_ic) { if (need_ic) {
if (offload_noghost) { if (offload_noghost) {
bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end); bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
} else { } else {
bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal); bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
} }
} else { } else {
if (offload_noghost) { if (offload_noghost) {
bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end); bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
} else { } else {
bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal); bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
} }
} }
} }

View File

@ -15,7 +15,7 @@
NPairStyle(full/bin/intel, NPairStyle(full/bin/intel,
NPairFullBinIntel, NPairFullBinIntel,
NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI |
NP_INTEL) NP_INTEL)
#else #else

View File

@ -26,7 +26,7 @@ using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
// constructor: forwards lmp to the NPairIntel base class, nothing else to set up
NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) :
  NPairIntel(lmp) {}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
@ -75,14 +75,14 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
int need_ic = 0; int need_ic = 0;
if (atom->molecular) if (atom->molecular)
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
neighbor->cutneighmax); neighbor->cutneighmax);
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (need_ic) { if (need_ic) {
if (offload_noghost) { if (offload_noghost) {
bin_newton<flt_t,acc_t,1,1,0,0,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,1,1,0,0,0>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal, bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal,
off_end); off_end);
} else { } else {
bin_newton<flt_t,acc_t,0,1,0,0,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,0,1,0,0,0>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal); bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
@ -90,7 +90,7 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
} else { } else {
if (offload_noghost) { if (offload_noghost) {
bin_newton<flt_t,acc_t,1,0,0,0,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,1,0,0,0,0>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal, bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal,
off_end); off_end);
} else { } else {
bin_newton<flt_t,acc_t,0,0,0,0,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,0,0,0,0,0>(1, list, buffers, 0, off_end);
@ -98,7 +98,7 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
} }
} }
#else #else
if (need_ic) if (need_ic)
bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal); bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
else else
bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal); bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);

View File

@ -26,7 +26,7 @@ using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
// constructor: forwards lmp to the NPairIntel base class, nothing else to set up
NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) :
  NPairIntel(lmp) {}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
@ -75,14 +75,14 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
int need_ic = 0; int need_ic = 0;
if (atom->molecular) if (atom->molecular)
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
neighbor->cutneighmax); neighbor->cutneighmax);
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (need_ic) { if (need_ic) {
if (offload_noghost) { if (offload_noghost) {
bin_newton<flt_t,acc_t,1,1,0,1,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,1,1,0,1,0>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal, bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal,
off_end); off_end);
} else { } else {
bin_newton<flt_t,acc_t,0,1,0,1,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,0,1,0,1,0>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal); bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
@ -90,8 +90,8 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
} else { } else {
if (offload_noghost) { if (offload_noghost) {
bin_newton<flt_t,acc_t,1,0,0,1,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,1,0,0,1,0>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal, bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal,
off_end); off_end);
} else { } else {
bin_newton<flt_t,acc_t,0,0,0,1,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,0,0,0,1,0>(1, list, buffers, 0, off_end);
bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal); bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);

View File

@ -40,7 +40,7 @@ NPairIntel::~NPairIntel() {
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (_off_map_stencil) { if (_off_map_stencil) {
const int * stencil = this->stencil; const int * stencil = this->stencil;
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(stencil:alloc_if(0) free_if(1)) nocopy(stencil:alloc_if(0) free_if(1))
} }
#endif #endif
@ -49,10 +49,10 @@ NPairIntel::~NPairIntel() {
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t, int offload_noghost, int need_ic, template <class flt_t, class acc_t, int offload_noghost, int need_ic,
int FULL, int TRI, int THREE> int FULL, int TRI, int THREE>
void NPairIntel::bin_newton(const int offload, NeighList *list, void NPairIntel::bin_newton(const int offload, NeighList *list,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const int astart, const int aend, const int astart, const int aend,
const int offload_end) { const int offload_end) {
if (aend-astart == 0) return; if (aend-astart == 0) return;
@ -66,7 +66,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
if (THREE == 0 && offload) { if (THREE == 0 && offload) {
if (INTEL_MIC_NBOR_PAD > 1) if (INTEL_MIC_NBOR_PAD > 1)
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
} else } else
#endif #endif
if (THREE == 0 && INTEL_NBOR_PAD > 1) if (THREE == 0 && INTEL_NBOR_PAD > 1)
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
@ -120,7 +120,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
overflow = _fix->get_off_overflow_flag(); overflow = _fix->get_off_overflow_flag();
_fix->stop_watch(TIME_HOST_NEIGHBOR); _fix->stop_watch(TIME_HOST_NEIGHBOR);
_fix->start_watch(TIME_OFFLOAD_LATENCY); _fix->start_watch(TIME_OFFLOAD_LATENCY);
} else } else
#endif #endif
{ {
tnum = comm->nthreads; tnum = comm->nthreads;
@ -193,8 +193,8 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
int end = stencil[k] + 1; int end = stencil[k] + 1;
for (int kk = k + 1; kk < nstencil; kk++) { for (int kk = k + 1; kk < nstencil; kk++) {
if (stencil[kk-1]+1 == stencil[kk]) { if (stencil[kk-1]+1 == stencil[kk]) {
end++; end++;
k++; k++;
} else break; } else break;
} }
binend[nstencilp] = end; binend[nstencilp] = end;
@ -214,16 +214,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
int tid, ifrom, ito; int tid, ifrom, ito;
if (THREE) { if (THREE) {
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width); IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
} else { } else {
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
} }
ifrom += astart; ifrom += astart;
ito += astart; ito += astart;
int e_ito = ito; int e_ito = ito;
if (THREE && ito == num) { if (THREE && ito == num) {
int imod = ito % pack_width; int imod = ito % pack_width;
if (imod) e_ito += pack_width - imod; if (imod) e_ito += pack_width - imod;
} }
const int list_size = (e_ito + tid * 2 + 2) * maxnbors; const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
@ -251,313 +251,313 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
// loop over all atoms in other bins in stencil, store every pair // loop over all atoms in other bins in stencil, store every pair
int istart, icount, ncount, oldbin = -9999999, lane, max_chunk; int istart, icount, ncount, oldbin = -9999999, lane, max_chunk;
if (THREE) { if (THREE) {
lane = 0; lane = 0;
max_chunk = 0; max_chunk = 0;
} }
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
const flt_t xtmp = x[i].x; const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y; const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z; const flt_t ztmp = x[i].z;
const int itype = x[i].w; const int itype = x[i].w;
tagint itag; tagint itag;
if (THREE) itag = tag[i]; if (THREE) itag = tag[i];
const int ioffset = ntypes * itype; const int ioffset = ntypes * itype;
const int ibin = atombin[i]; const int ibin = atombin[i];
if (ibin != oldbin) { if (ibin != oldbin) {
oldbin = ibin; oldbin = ibin;
ncount = 0; ncount = 0;
for (int k = 0; k < nstencilp; k++) { for (int k = 0; k < nstencilp; k++) {
const int bstart = binhead[ibin + binstart[k]]; const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]]; const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd #pragma simd
#endif #endif
for (int jj = bstart; jj < bend; jj++) for (int jj = bstart; jj < bend; jj++)
tj[ncount++] = binpacked[jj]; tj[ncount++] = binpacked[jj];
} }
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd #pragma simd
#endif #endif
for (int u = 0; u < ncount; u++) { for (int u = 0; u < ncount; u++) {
const int j = tj[u]; const int j = tj[u];
tx[u] = x[j].x; tx[u] = x[j].x;
ty[u] = x[j].y; ty[u] = x[j].y;
tz[u] = x[j].z; tz[u] = x[j].z;
tjtype[u] = x[j].w; tjtype[u] = x[j].w;
} }
if (FULL == 0 || TRI == 1) { if (FULL == 0 || TRI == 1) {
icount = 0; icount = 0;
istart = ncount; istart = ncount;
const int alignb = INTEL_DATA_ALIGN / sizeof(int); const int alignb = INTEL_DATA_ALIGN / sizeof(int);
int nedge = istart % alignb; int nedge = istart % alignb;
if (nedge) istart + (alignb - nedge); if (nedge) istart + (alignb - nedge);
itx = tx + istart; itx = tx + istart;
ity = ty + istart; ity = ty + istart;
itz = tz + istart; itz = tz + istart;
itj = tj + istart; itj = tj + istart;
itjtype = tjtype + istart; itjtype = tjtype + istart;
const int bstart = binhead[ibin]; const int bstart = binhead[ibin];
const int bend = binhead[ibin + 1]; const int bend = binhead[ibin + 1];
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd #pragma simd
#endif #endif
for (int jj = bstart; jj < bend; jj++) { for (int jj = bstart; jj < bend; jj++) {
const int j = binpacked[jj]; const int j = binpacked[jj];
itj[icount] = j; itj[icount] = j;
itx[icount] = x[j].x; itx[icount] = x[j].x;
ity[icount] = x[j].y; ity[icount] = x[j].y;
itz[icount] = x[j].z; itz[icount] = x[j].z;
itjtype[icount] = x[j].w; itjtype[icount] = x[j].w;
icount++; icount++;
} }
if (icount + istart > obound) *overflow = 1; if (icount + istart > obound) *overflow = 1;
} else } else
if (ncount > obound) *overflow = 1; if (ncount > obound) *overflow = 1;
} }
// ---------------------- Loop over i bin // ---------------------- Loop over i bin
int n = 0; int n = 0;
if (FULL == 0 || TRI == 1) { if (FULL == 0 || TRI == 1) {
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma ivdep #pragma ivdep
#endif #endif
for (int u = 0; u < icount; u++) { for (int u = 0; u < icount; u++) {
int addme = 1; int addme = 1;
int j = itj[u]; int j = itj[u];
// Cutoff Check // Cutoff Check
const flt_t delx = xtmp - itx[u]; const flt_t delx = xtmp - itx[u];
const flt_t dely = ytmp - ity[u]; const flt_t dely = ytmp - ity[u];
const flt_t delz = ztmp - itz[u]; const flt_t delz = ztmp - itz[u];
const int jtype = itjtype[u]; const int jtype = itjtype[u];
const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq > cutneighsq[ioffset + jtype]) addme = 0; if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
// i bin (half) check and offload ghost check // i bin (half) check and offload ghost check
if (j < nlocal) { if (j < nlocal) {
const int ijmod = (i + j) % 2; const int ijmod = (i + j) % 2;
if (i > j) { if (i > j) {
if (ijmod == 0) addme = 0; if (ijmod == 0) addme = 0;
} else if (i < j) { } else if (i < j) {
if (ijmod == 1) addme = 0; if (ijmod == 1) addme = 0;
} else } else
addme = 0; addme = 0;
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost && i < offload_end) addme = 0; if (offload_noghost && i < offload_end) addme = 0;
#endif #endif
} else { } else {
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost && offload) addme = 0; if (offload_noghost && offload) addme = 0;
#endif #endif
if (itz[u] < ztmp) addme = 0; if (itz[u] < ztmp) addme = 0;
if (itz[u] == ztmp) { if (itz[u] == ztmp) {
if (ity[u] < ytmp) addme = 0; if (ity[u] < ytmp) addme = 0;
if (ity[u] == ytmp && itx[u] < xtmp) addme = 0; if (ity[u] == ytmp && itx[u] < xtmp) addme = 0;
} }
} }
if (need_ic) { if (need_ic) {
int no_special; int no_special;
ominimum_image_check(no_special, delx, dely, delz); ominimum_image_check(no_special, delx, dely, delz);
if (no_special) if (no_special)
j = -j - 1; j = -j - 1;
} }
if (addme) if (addme)
neighptr[n++] = j; neighptr[n++] = j;
} }
} // if FULL==0 } // if FULL==0
// ---------------------- Loop over other bins // ---------------------- Loop over other bins
int n2, *neighptr2; int n2, *neighptr2;
if (THREE) { if (THREE) {
n = pack_offset; n = pack_offset;
n2 = pack_offset + maxnbors; n2 = pack_offset + maxnbors;
neighptr2 = neighptr; neighptr2 = neighptr;
} }
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma ivdep #pragma ivdep
#endif #endif
for (int u = 0; u < ncount; u++) { for (int u = 0; u < ncount; u++) {
int addme = 1; int addme = 1;
int j = tj[u]; int j = tj[u];
if (FULL) if (FULL)
if (i == j) addme = 0; if (i == j) addme = 0;
// Cutoff Check // Cutoff Check
const flt_t delx = xtmp - tx[u]; const flt_t delx = xtmp - tx[u];
const flt_t dely = ytmp - ty[u]; const flt_t dely = ytmp - ty[u];
const flt_t delz = ztmp - tz[u]; const flt_t delz = ztmp - tz[u];
const int jtype = tjtype[u]; const int jtype = tjtype[u];
const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq > cutneighsq[ioffset + jtype]) addme = 0; if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
// Triclinic // Triclinic
if (TRI) { if (TRI) {
if (tz[u] < ztmp) addme = 0; if (tz[u] < ztmp) addme = 0;
if (tz[u] == ztmp) { if (tz[u] == ztmp) {
if (ty[u] < ytmp) addme = 0; if (ty[u] < ytmp) addme = 0;
if (ty[u] == ytmp) { if (ty[u] == ytmp) {
if (tx[u] < xtmp) addme = 0; if (tx[u] < xtmp) addme = 0;
if (tx[u] == xtmp && j <= i) addme = 0; if (tx[u] == xtmp && j <= i) addme = 0;
} }
} }
} }
// offload ghost check // offload ghost check
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) { if (offload_noghost) {
if (j < nlocal) { if (j < nlocal) {
if (i < offload_end) addme = 0; if (i < offload_end) addme = 0;
} else if (offload) addme = 0; } else if (offload) addme = 0;
} }
#endif #endif
int pj; int pj;
if (THREE) pj = j; if (THREE) pj = j;
if (need_ic) { if (need_ic) {
int no_special; int no_special;
ominimum_image_check(no_special, delx, dely, delz); ominimum_image_check(no_special, delx, dely, delz);
if (no_special) if (no_special)
j = -j - 1; j = -j - 1;
} }
if (THREE) { if (THREE) {
const int jtag = tag[pj]; const int jtag = tag[pj];
int flist = 0; int flist = 0;
if (itag > jtag) { if (itag > jtag) {
if ((itag+jtag) % 2 == 0) flist = 1; if ((itag+jtag) % 2 == 0) flist = 1;
} else if (itag < jtag) { } else if (itag < jtag) {
if ((itag+jtag) % 2 == 1) flist = 1; if ((itag+jtag) % 2 == 1) flist = 1;
} else { } else {
if (tz[u] < ztmp) flist = 1; if (tz[u] < ztmp) flist = 1;
else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1; else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp) else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
flist = 1; flist = 1;
} }
if (addme) { if (addme) {
if (flist) if (flist)
neighptr2[n2++] = j; neighptr2[n2++] = j;
else else
neighptr[n++] = j; neighptr[n++] = j;
} }
} else { } else {
if (addme) if (addme)
neighptr[n++] = j; neighptr[n++] = j;
} }
} // for u } // for u
#ifndef _LMP_INTEL_OFFLOAD #ifndef _LMP_INTEL_OFFLOAD
if (exclude) { if (exclude) {
int alln = n; int alln = n;
if (THREE) n = pack_offset; if (THREE) n = pack_offset;
else n = 0; else n = 0;
for (int u = pack_offset; u < alln; u++) { for (int u = pack_offset; u < alln; u++) {
const int j = neighptr[u]; const int j = neighptr[u];
int pj = j; int pj = j;
if (need_ic) if (need_ic)
if (pj < 0) pj = -j - 1; if (pj < 0) pj = -j - 1;
const int jtype = x[pj].w; const int jtype = x[pj].w;
if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
neighptr[n++] = j; neighptr[n++] = j;
}
if (THREE) {
alln = n2;
n2 = pack_offset + maxnbors;
for (int u = pack_offset + maxnbors; u < alln; u++) {
const int j = neighptr[u];
int pj = j;
if (need_ic)
if (pj < 0) pj = -j - 1;
const int jtype = x[pj].w;
if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
neighptr[n2++] = j;
}
} }
if (THREE) {
alln = n2;
n2 = pack_offset + maxnbors;
for (int u = pack_offset + maxnbors; u < alln; u++) {
const int j = neighptr[u];
int pj = j;
if (need_ic)
if (pj < 0) pj = -j - 1;
const int jtype = x[pj].w;
if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
neighptr[n2++] = j;
}
}
} }
#endif #endif
int ns; int ns;
if (THREE) { if (THREE) {
int alln = n; int alln = n;
ns = n - pack_offset; ns = n - pack_offset;
atombin[i] = ns; atombin[i] = ns;
n = lane; n = lane;
for (int u = pack_offset; u < alln; u++) { for (int u = pack_offset; u < alln; u++) {
neighptr[n] = neighptr[u]; neighptr[n] = neighptr[u];
n += pack_width; n += pack_width;
} }
ns += n2 - pack_offset - maxnbors; ns += n2 - pack_offset - maxnbors;
for (int u = pack_offset + maxnbors; u < n2; u++) { for (int u = pack_offset + maxnbors; u < n2; u++) {
neighptr[n] = neighptr[u]; neighptr[n] = neighptr[u];
n += pack_width; n += pack_width;
} }
if (ns > maxnbors) *overflow = 1; if (ns > maxnbors) *overflow = 1;
} else } else
if (n > maxnbors) *overflow = 1; if (n > maxnbors) *overflow = 1;
ilist[i] = i; ilist[i] = i;
cnumneigh[i] = ct; cnumneigh[i] = ct;
if (THREE) { if (THREE) {
cnumneigh[i] += lane; cnumneigh[i] += lane;
numneigh[i] = ns; numneigh[i] = ns;
} else { } else {
int edge = (n % pad_width); int edge = (n % pad_width);
if (edge) { if (edge) {
const int pad_end = n + (pad_width - edge); const int pad_end = n + (pad_width - edge);
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \ #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
avg=INTEL_COMPILE_WIDTH/2 avg=INTEL_COMPILE_WIDTH/2
#endif #endif
for ( ; n < pad_end; n++) for ( ; n < pad_end; n++)
neighptr[n] = e_nall; neighptr[n] = e_nall;
} }
numneigh[i] = n; numneigh[i] = n;
} }
if (THREE) { if (THREE) {
if (ns > max_chunk) max_chunk = ns; if (ns > max_chunk) max_chunk = ns;
lane++; lane++;
if (lane == pack_width) { if (lane == pack_width) {
ct += max_chunk * pack_width; ct += max_chunk * pack_width;
const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
const int edge = (ct % alignb); const int edge = (ct % alignb);
if (edge) ct += alignb - edge; if (edge) ct += alignb - edge;
neighptr = firstneigh + ct; neighptr = firstneigh + ct;
max_chunk = 0; max_chunk = 0;
pack_offset = maxnbors * pack_width; pack_offset = maxnbors * pack_width;
lane = 0; lane = 0;
if (ct + obound > list_size) { if (ct + obound > list_size) {
if (i < ito - 1) { if (i < ito - 1) {
*overflow = 1; *overflow = 1;
ct = (ifrom + tid * 2) * maxnbors; ct = (ifrom + tid * 2) * maxnbors;
} }
} }
} }
} else { } else {
ct += n; ct += n;
const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
const int edge = (ct % alignb); const int edge = (ct % alignb);
if (edge) ct += alignb - edge; if (edge) ct += alignb - edge;
neighptr = firstneigh + ct; neighptr = firstneigh + ct;
if (ct + obound > list_size) { if (ct + obound > list_size) {
if (i < ito - 1) { if (i < ito - 1) {
*overflow = 1; *overflow = 1;
ct = (ifrom + tid * 2) * maxnbors; ct = (ifrom + tid * 2) * maxnbors;
} }
} }
} }
} }
if (*overflow == 1) if (*overflow == 1)
@ -568,16 +568,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
int ghost_offset = 0, nall_offset = e_nall; int ghost_offset = 0, nall_offset = e_nall;
if (separate_buffers) { if (separate_buffers) {
for (int i = ifrom; i < ito; ++i) { for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i]; int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i]; const int jnum = numneigh[i];
#if __INTEL_COMPILER+0 > 1499 #if __INTEL_COMPILER+0 > 1499
#pragma vector aligned #pragma vector aligned
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
#endif #endif
for (int jj = 0; jj < jnum; jj++) { for (int jj = 0; jj < jnum; jj++) {
int j = jlist[jj]; int j = jlist[jj];
if (need_ic && j < 0) j = -j - 1; if (need_ic && j < 0) j = -j - 1;
if (j < nlocal) { if (j < nlocal) {
if (j < vlmin) vlmin = j; if (j < vlmin) vlmin = j;
if (j > vlmax) vlmax = j; if (j > vlmax) vlmax = j;
@ -585,33 +585,33 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
if (j < vgmin) vgmin = j; if (j < vgmin) vgmin = j;
if (j > vgmax) vgmax = j; if (j > vgmax) vgmax = j;
} }
} }
} }
lmin = MIN(lmin,vlmin); lmin = MIN(lmin,vlmin);
gmin = MIN(gmin,vgmin); gmin = MIN(gmin,vgmin);
lmax = MAX(lmax,vlmax); lmax = MAX(lmax,vlmax);
gmax = MAX(gmax,vgmax); gmax = MAX(gmax,vgmax);
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp critical #pragma omp critical
#endif #endif
{ {
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
}
#pragma omp barrier
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
if (nghost < 0) nghost = 0;
if (offload) {
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
} else {
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
nall_offset = nlocal + nghost;
} }
#pragma omp barrier
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
if (nghost < 0) nghost = 0;
if (offload) {
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
} else {
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
nall_offset = nlocal + nghost;
}
} // if separate_buffers } // if separate_buffers
#endif #endif
@ -620,67 +620,67 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
int * _noalias jlist = firstneigh + cnumneigh[i]; int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i]; const int jnum = numneigh[i];
if (THREE) { if (THREE) {
const int trip = jnum * pack_width; const int trip = jnum * pack_width;
for (int jj = 0; jj < trip; jj+=pack_width) { for (int jj = 0; jj < trip; jj+=pack_width) {
const int j = jlist[jj]; const int j = jlist[jj];
if (need_ic && j < 0) { if (need_ic && j < 0) {
which = 0; which = 0;
jlist[jj] = -j - 1; jlist[jj] = -j - 1;
} else } else
ofind_special(which, special, nspecial, i, tag[j]); ofind_special(which, special, nspecial, i, tag[j]);
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (j >= nlocal) { if (j >= nlocal) {
if (j == e_nall) if (j == e_nall)
jlist[jj] = nall_offset; jlist[jj] = nall_offset;
else if (which) else if (which)
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
else jlist[jj]-=ghost_offset; else jlist[jj]-=ghost_offset;
} else } else
#endif #endif
if (which) jlist[jj] = j ^ (which << SBBITS);
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj];
if (need_ic && j < 0) {
which = 0;
jlist[jj] = -j - 1;
} else
ofind_special(which, special, nspecial, i, tag[j]);
#ifdef _LMP_INTEL_OFFLOAD
if (j >= nlocal) {
if (j == e_nall)
jlist[jj] = nall_offset;
else if (which)
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
else jlist[jj]-=ghost_offset;
} else
#endif
if (which) jlist[jj] = j ^ (which << SBBITS); if (which) jlist[jj] = j ^ (which << SBBITS);
} }
} } else {
} // for i #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj];
if (need_ic && j < 0) {
which = 0;
jlist[jj] = -j - 1;
} else
ofind_special(which, special, nspecial, i, tag[j]);
#ifdef _LMP_INTEL_OFFLOAD
if (j >= nlocal) {
if (j == e_nall)
jlist[jj] = nall_offset;
else if (which)
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
else jlist[jj]-=ghost_offset;
} else
#endif
if (which) jlist[jj] = j ^ (which << SBBITS);
}
}
} // for i
} // if molecular } // if molecular
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
else if (separate_buffers) { else if (separate_buffers) {
for (int i = ifrom; i < ito; ++i) { for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i]; int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i]; const int jnum = numneigh[i];
int jj = 0; int jj = 0;
#pragma vector aligned #pragma vector aligned
#pragma simd #pragma simd
for (jj = 0; jj < jnum; jj++) { for (jj = 0; jj < jnum; jj++) {
if (jlist[jj] >= nlocal) { if (jlist[jj] >= nlocal) {
if (jlist[jj] == e_nall) jlist[jj] = nall_offset; if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
else jlist[jj] -= ghost_offset; else jlist[jj] -= ghost_offset;
} }
} }
} }
} }
#endif #endif
} // end omp } // end omp
@ -704,9 +704,9 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
_fix->start_watch(TIME_PACK); _fix->start_watch(TIME_PACK);
_fix->set_neighbor_host_sizes(); _fix->set_neighbor_host_sizes();
buffers->pack_sep_from_single(_fix->host_min_local(), buffers->pack_sep_from_single(_fix->host_min_local(),
_fix->host_used_local(), _fix->host_used_local(),
_fix->host_min_ghost(), _fix->host_min_ghost(),
_fix->host_used_ghost()); _fix->host_used_ghost());
_fix->stop_watch(TIME_PACK); _fix->stop_watch(TIME_PACK);
} }
} }
@ -732,9 +732,9 @@ void NPairIntel::grow_stencil()
_off_map_stencil = stencil; _off_map_stencil = stencil;
const int * stencil = _off_map_stencil; const int * stencil = _off_map_stencil;
const int maxstencil = ns->get_maxstencil(); const int maxstencil = ns->get_maxstencil();
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
in(stencil:length(maxstencil) alloc_if(1) free_if(0)) in(stencil:length(maxstencil) alloc_if(1) free_if(0))
} }
} }
#endif #endif

View File

@ -84,8 +84,8 @@ class NPairIntel : public NPair {
FixIntel *_fix; FixIntel *_fix;
template <class flt_t, class acc_t, int, int, int, int, int> template <class flt_t, class acc_t, int, int, int, int, int>
void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *,
const int, const int, const int offload_end = 0); const int, const int, const int offload_end = 0);
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
int _cop; int _cop;

View File

@ -55,7 +55,7 @@ PairBuckCoulCutIntel::~PairBuckCoulCutIntel()
void PairBuckCoulCutIntel::compute(int eflag, int vflag) void PairBuckCoulCutIntel::compute(int eflag, int vflag)
{ {
if (fix->precision()==FixIntel::PREC_MODE_MIXED) if (fix->precision()==FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single); force_const_single);
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(), compute<double,double>(eflag, vflag, fix->get_double_buffers(),
@ -70,8 +70,8 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void PairBuckCoulCutIntel::compute(int eflag, int vflag, void PairBuckCoulCutIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) { if (eflag || vflag) {
ev_setup(eflag,vflag); ev_setup(eflag,vflag);
@ -94,13 +94,13 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag,
#endif #endif
{ {
int ifrom, ito, tid; int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
packthreads, sizeof(ATOM_T)); packthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago); buffers->thr_pack(ifrom,ito,ago);
} }
fix->stop_watch(TIME_PACK); fix->stop_watch(TIME_PACK);
} }
int ovflag = 0; int ovflag = 0;
if (vflag_fdotr) ovflag = 2; if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1; else if (vflag) ovflag = 1;
@ -127,9 +127,9 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag,
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairBuckCoulCutIntel::eval(const int offload, const int vflag, void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc, const ForceConst<flt_t> &fc,
const int astart, const int aend) const int astart, const int aend)
{ {
const int inum = aend - astart; const int inum = aend - astart;
if (inum == 0) return; if (inum == 0) return;
@ -160,8 +160,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
// Determine how much data to transfer // Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag; int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
buffers, offload, fix, separate_flag, buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride); x_size, q_size, ev_size, f_stride);
int tc; int tc;
FORCE_T * _noalias f_start; FORCE_T * _noalias f_start;
@ -198,8 +198,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
*timer_compute = MIC_Wtime(); *timer_compute = MIC_Wtime();
#endif #endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, q); f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0; if (EFLAG) oevdwl = oecoul = (acc_t)0;
@ -233,20 +233,20 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t fxtmp,fytmp,fztmp,fwtmp;
acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
const flt_t xtmp = x[i].x; const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y; const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z; const flt_t ztmp = x[i].z;
const flt_t qtmp = q[i]; const flt_t qtmp = q[i];
fxtmp = fytmp = fztmp = (acc_t)0; fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5) sv0, sv1, sv2, sv3, sv4, sv5)
#endif #endif
for (int jj = 0; jj < jnum; jj++) { for (int jj = 0; jj < jnum; jj++) {
flt_t forcecoul, forcebuck, evdwl, ecoul; flt_t forcecoul, forcebuck, evdwl, ecoul;
@ -262,19 +262,19 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t rsq = delx * delx + dely * dely + delz * delz;
const flt_t r = sqrt(rsq); const flt_t r = sqrt(rsq);
const flt_t r2inv = (flt_t)1.0 / rsq; const flt_t r2inv = (flt_t)1.0 / rsq;
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
if (rsq < c_cuti[jtype].cut_coulsq) { if (rsq < c_cuti[jtype].cut_coulsq) {
#endif #endif
forcecoul = qqrd2e * qtmp*q[j]/r; forcecoul = qqrd2e * qtmp*q[j]/r;
if (EFLAG) if (EFLAG)
ecoul = forcecoul; ecoul = forcecoul;
if (sbindex){ if (sbindex){
const flt_t factor_coul = special_coul[sbindex]; const flt_t factor_coul = special_coul[sbindex];
forcecoul *= factor_coul; forcecoul *= factor_coul;
if(EFLAG) if(EFLAG)
ecoul *= factor_coul; ecoul *= factor_coul;
} }
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
} }
@ -282,7 +282,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
if (rsq >= c_cuti[jtype].cut_coulsq) if (rsq >= c_cuti[jtype].cut_coulsq)
{ forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; } { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
#endif #endif
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
if (rsq < c_cuti[jtype].cut_ljsq) { if (rsq < c_cuti[jtype].cut_ljsq) {
#endif #endif
@ -290,14 +290,14 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
flt_t rexp = exp(-r * c_forcei[jtype].rhoinv); flt_t rexp = exp(-r * c_forcei[jtype].rhoinv);
forcebuck = r * rexp * c_forcei[jtype].buck1 - forcebuck = r * rexp * c_forcei[jtype].buck1 -
r6inv * c_forcei[jtype].buck2; r6inv * c_forcei[jtype].buck2;
if (EFLAG) if (EFLAG)
evdwl = rexp * c_energyi[jtype].a - evdwl = rexp * c_energyi[jtype].a -
r6inv * c_energyi[jtype].c - r6inv * c_energyi[jtype].c -
c_energyi[jtype].offset; c_energyi[jtype].offset;
if (sbindex) { if (sbindex) {
const flt_t factor_lj = special_lj[sbindex]; const flt_t factor_lj = special_lj[sbindex];
forcebuck *= factor_lj; forcebuck *= factor_lj;
if (EFLAG) if (EFLAG)
evdwl *= factor_lj; evdwl *= factor_lj;
} }
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
@ -311,51 +311,51 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
if (rsq < c_cuti[jtype].cutsq) { if (rsq < c_cuti[jtype].cutsq) {
#endif #endif
const flt_t fpair = (forcecoul + forcebuck) * r2inv; const flt_t fpair = (forcecoul + forcebuck) * r2inv;
const flt_t fpx = fpair * delx; const flt_t fpx = fpair * delx;
fxtmp += fpx; fxtmp += fpx;
if (NEWTON_PAIR) f[j].x -= fpx; if (NEWTON_PAIR) f[j].x -= fpx;
const flt_t fpy = fpair * dely; const flt_t fpy = fpair * dely;
fytmp += fpy; fytmp += fpy;
if (NEWTON_PAIR) f[j].y -= fpy; if (NEWTON_PAIR) f[j].y -= fpy;
const flt_t fpz = fpair * delz; const flt_t fpz = fpair * delz;
fztmp += fpz; fztmp += fpz;
if (NEWTON_PAIR) f[j].z -= fpz; if (NEWTON_PAIR) f[j].z -= fpz;
if (EFLAG) { if (EFLAG) {
sevdwl += evdwl; sevdwl += evdwl;
secoul += ecoul; secoul += ecoul;
if (eatom) { if (eatom) {
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
if (NEWTON_PAIR) if (NEWTON_PAIR)
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
} }
} }
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
} }
#endif #endif
} // for jj } // for jj
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
f[i].x += fxtmp; f[i].x += fxtmp;
f[i].y += fytmp; f[i].y += fytmp;
f[i].z += fztmp; f[i].z += fztmp;
} else { } else {
f[i].x = fxtmp; f[i].x = fxtmp;
f[i].y = fytmp; f[i].y = fytmp;
f[i].z = fztmp; f[i].z = fztmp;
} }
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
} // for ii } // for ii
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
ov4, ov5); ov4, ov5);
} // end of omp parallel region } // end of omp parallel region
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5); ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) { if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
@ -364,12 +364,12 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
} }
if (vflag) { if (vflag) {
if (NEWTON_PAIR == 0) { if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5; ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5; ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5; ov2 *= (acc_t)0.5;
ov3 *= (acc_t)0.5; ov3 *= (acc_t)0.5;
ov4 *= (acc_t)0.5; ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5; ov5 *= (acc_t)0.5;
} }
ev_global[2] = ov0; ev_global[2] = ov0;
ev_global[3] = ov1; ev_global[3] = ov1;
@ -410,7 +410,7 @@ void PairBuckCoulCutIntel::init_style()
error->all(FLERR, error->all(FLERR,
"The 'package intel' command is required for /intel styles"); "The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]); fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check(); fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
_cop = fix->coprocessor_number(); _cop = fix->coprocessor_number();
@ -492,9 +492,9 @@ void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int ntable, const int ntable,
Memory *memory, Memory *memory,
const int cop) { const int cop) {
if ( (ntypes != _ntypes || ntable != _ntable) ) { if ( (ntypes != _ntypes || ntable != _ntable) ) {
if (_ntypes > 0) { if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
@ -505,12 +505,12 @@ void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
c_cut_t * oc_cut = c_cut[0]; c_cut_t * oc_cut = c_cut[0];
if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL && if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL &&
oc_energy != NULL && ospecial_coul != NULL && oc_energy != NULL && ospecial_coul != NULL &&
_cop >= 0) { _cop >= 0) {
#pragma offload_transfer target(mic:cop) \ #pragma offload_transfer target(mic:cop) \
nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
nocopy(oc_cut: alloc_if(0) free_if(1)) nocopy(oc_cut: alloc_if(0) free_if(1))
} }
#endif #endif
@ -534,7 +534,7 @@ void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
c_cut_t * oc_cut = c_cut[0]; c_cut_t * oc_cut = c_cut[0];
int tp1sq = ntypes*ntypes; int tp1sq = ntypes*ntypes;
if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL && if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL &&
oc_energy != NULL && ospecial_coul != NULL && oc_energy != NULL && ospecial_coul != NULL &&
cop >= 0) { cop >= 0) {
#pragma offload_transfer target(mic:cop) \ #pragma offload_transfer target(mic:cop) \
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \

View File

@ -51,8 +51,8 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut {
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void eval(const int offload, const int vflag, void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc, const int astart, const int aend); const ForceConst<flt_t> &fc, const int astart, const int aend);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
@ -75,7 +75,7 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut {
~ForceConst() { set_ntypes(0,0,NULL,_cop); } ~ForceConst() { set_ntypes(0,0,NULL,_cop); }
void set_ntypes(const int ntypes, const int ntable, Memory *memory, void set_ntypes(const int ntypes, const int ntable, Memory *memory,
const int cop); const int cop);
private: private:
int _ntypes, _ntable, _cop; int _ntypes, _ntable, _cop;

View File

@ -55,7 +55,7 @@ PairBuckCoulLongIntel::~PairBuckCoulLongIntel()
void PairBuckCoulLongIntel::compute(int eflag, int vflag) void PairBuckCoulLongIntel::compute(int eflag, int vflag)
{ {
if (fix->precision()==FixIntel::PREC_MODE_MIXED) if (fix->precision()==FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single); force_const_single);
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(), compute<double,double>(eflag, vflag, fix->get_double_buffers(),
@ -70,8 +70,8 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void PairBuckCoulLongIntel::compute(int eflag, int vflag, void PairBuckCoulLongIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) { if (eflag || vflag) {
ev_setup(eflag,vflag); ev_setup(eflag,vflag);
@ -85,7 +85,7 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,
if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) { if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
fix->start_watch(TIME_PACK); fix->start_watch(TIME_PACK);
int packthreads; int packthreads;
if (nthreads > INTEL_HTHREADS) packthreads = nthreads; if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
else packthreads = 1; else packthreads = 1;
@ -94,13 +94,13 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,
#endif #endif
{ {
int ifrom, ito, tid; int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
packthreads, sizeof(ATOM_T)); packthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago); buffers->thr_pack(ifrom,ito,ago);
} }
fix->stop_watch(TIME_PACK); fix->stop_watch(TIME_PACK);
} }
int ovflag = 0; int ovflag = 0;
if (vflag_fdotr) ovflag = 2; if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1; else if (vflag) ovflag = 1;
@ -127,9 +127,9 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairBuckCoulLongIntel::eval(const int offload, const int vflag, void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc, const ForceConst<flt_t> &fc,
const int astart, const int aend) const int astart, const int aend)
{ {
const int inum = aend - astart; const int inum = aend - astart;
if (inum == 0) return; if (inum == 0) return;
@ -175,8 +175,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
// Determine how much data to transfer // Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag; int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
buffers, offload, fix, separate_flag, buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride); x_size, q_size, ev_size, f_stride);
int tc; int tc;
FORCE_T * _noalias f_start; FORCE_T * _noalias f_start;
@ -213,7 +213,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \ in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \ in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \
@ -224,8 +224,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
*timer_compute = MIC_Wtime(); *timer_compute = MIC_Wtime();
#endif #endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, q); f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0; if (EFLAG) oevdwl = oecoul = (acc_t)0;
@ -260,24 +260,24 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
const int ptr_off = itype * ntypes; const int ptr_off = itype * ntypes;
const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
const flt_t * _noalias const rho_invi = rho_inv + ptr_off; const flt_t * _noalias const rho_invi = rho_inv + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i]; const int jnum = numneigh[i];
acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t fxtmp,fytmp,fztmp,fwtmp;
acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
const flt_t xtmp = x[i].x; const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y; const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z; const flt_t ztmp = x[i].z;
const flt_t qtmp = q[i]; const flt_t qtmp = q[i];
fxtmp = fytmp = fztmp = (acc_t)0; fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
int ej = 0; int ej = 0;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma ivdep #pragma ivdep
@ -287,33 +287,33 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
const flt_t delx = xtmp - x[j].x; const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y; const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z; const flt_t delz = ztmp - x[j].z;
const int jtype = x[j].w; const int jtype = x[j].w;
const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq < c_forcei[jtype].cutsq) { if (rsq < c_forcei[jtype].cutsq) {
trsq[ej]=rsq; trsq[ej]=rsq;
tdelx[ej]=delx; tdelx[ej]=delx;
tdely[ej]=dely; tdely[ej]=dely;
tdelz[ej]=delz; tdelz[ej]=delz;
tjtype[ej]=jtype; tjtype[ej]=jtype;
tj[ej]=jlist[jj]; tj[ej]=jlist[jj];
ej++; ej++;
} }
} }
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5) sv0, sv1, sv2, sv3, sv4, sv5)
#endif #endif
for (int jj = 0; jj < ej; jj++) { for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcebuck, evdwl, ecoul; flt_t forcecoul, forcebuck, evdwl, ecoul;
forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0; forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
const int j = tj[jj] & NEIGHMASK; const int j = tj[jj] & NEIGHMASK;
const int sbindex = tj[jj] >> SBBITS & 3; const int sbindex = tj[jj] >> SBBITS & 3;
const int jtype = tjtype[jj]; const int jtype = tjtype[jj];
const flt_t rsq = trsq[jj]; const flt_t rsq = trsq[jj];
const flt_t r2inv = (flt_t)1.0 / rsq; const flt_t r2inv = (flt_t)1.0 / rsq;
const flt_t r = (flt_t)1.0 / sqrt(r2inv); const flt_t r = (flt_t)1.0 / sqrt(r2inv);
@ -321,52 +321,52 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
if (!ncoultablebits || rsq <= tabinnersq) { if (!ncoultablebits || rsq <= tabinnersq) {
#endif #endif
const flt_t A1 = 0.254829592; const flt_t A1 = 0.254829592;
const flt_t A2 = -0.284496736; const flt_t A2 = -0.284496736;
const flt_t A3 = 1.421413741; const flt_t A3 = 1.421413741;
const flt_t A4 = -1.453152027; const flt_t A4 = -1.453152027;
const flt_t A5 = 1.061405429; const flt_t A5 = 1.061405429;
const flt_t EWALD_F = 1.12837917; const flt_t EWALD_F = 1.12837917;
const flt_t INV_EWALD_P = 1.0 / 0.3275911; const flt_t INV_EWALD_P = 1.0 / 0.3275911;
const flt_t grij = g_ewald * r; const flt_t grij = g_ewald * r;
const flt_t expm2 = exp(-grij * grij); const flt_t expm2 = exp(-grij * grij);
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const flt_t prefactor = qqrd2e * qtmp * q[j] / r; const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
if (EFLAG) ecoul = prefactor * erfc; if (EFLAG) ecoul = prefactor * erfc;
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
prefactor;
forcecoul -= adjust;
if (EFLAG) ecoul -= adjust;
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
prefactor;
forcecoul -= adjust;
if (EFLAG) ecoul -= adjust;
#ifdef INTEL_ALLOW_TABLE #ifdef INTEL_ALLOW_TABLE
} else { } else {
float rsq_lookup = rsq; float rsq_lookup = rsq;
const int itable = (__intel_castf32_u32(rsq_lookup) & const int itable = (__intel_castf32_u32(rsq_lookup) &
ncoulmask) >> ncoulshiftbits; ncoulmask) >> ncoulshiftbits;
const flt_t fraction = (rsq_lookup - table[itable].r) * const flt_t fraction = (rsq_lookup - table[itable].r) *
table[itable].dr; table[itable].dr;
const flt_t tablet = table[itable].f + const flt_t tablet = table[itable].f +
fraction * table[itable].df; fraction * table[itable].df;
forcecoul = qtmp * q[j] * tablet; forcecoul = qtmp * q[j] * tablet;
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
fraction * detable[itable]); fraction * detable[itable]);
if (sbindex) { if (sbindex) {
const flt_t table2 = ctable[itable] + const flt_t table2 = ctable[itable] +
fraction * dctable[itable]; fraction * dctable[itable];
const flt_t prefactor = qtmp * q[j] * table2; const flt_t prefactor = qtmp * q[j] * table2;
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
prefactor; prefactor;
forcecoul -= adjust; forcecoul -= adjust;
if (EFLAG) ecoul -= adjust; if (EFLAG) ecoul -= adjust;
} }
} }
#endif #endif
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
if (rsq < c_forcei[jtype].cut_ljsq) { if (rsq < c_forcei[jtype].cut_ljsq) {
#endif #endif
flt_t r6inv = r2inv * r2inv * r2inv; flt_t r6inv = r2inv * r2inv * r2inv;
@ -389,7 +389,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
{ forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; } { forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; }
#endif #endif
const flt_t fpair = (forcecoul + forcebuck) * r2inv; const flt_t fpair = (forcecoul + forcebuck) * r2inv;
const flt_t fpx = fpair * tdelx[jj]; const flt_t fpx = fpair * tdelx[jj];
fxtmp += fpx; fxtmp += fpx;
if (NEWTON_PAIR) f[j].x -= fpx; if (NEWTON_PAIR) f[j].x -= fpx;
@ -400,38 +400,38 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
fztmp += fpz; fztmp += fpz;
if (NEWTON_PAIR) f[j].z -= fpz; if (NEWTON_PAIR) f[j].z -= fpz;
if (EFLAG) { if (EFLAG) {
sevdwl += evdwl; sevdwl += evdwl;
secoul += ecoul; secoul += ecoul;
if (eatom) { if (eatom) {
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
if (NEWTON_PAIR) if (NEWTON_PAIR)
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
} }
} }
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
fpx, fpy, fpz); fpx, fpy, fpz);
} // for jj } // for jj
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
f[i].x += fxtmp; f[i].x += fxtmp;
f[i].y += fytmp; f[i].y += fytmp;
f[i].z += fztmp; f[i].z += fztmp;
} else { } else {
f[i].x = fxtmp; f[i].x = fxtmp;
f[i].y = fytmp; f[i].y = fytmp;
f[i].z = fztmp; f[i].z = fztmp;
} }
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
} // for ii } // for ii
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
ov4, ov5); ov4, ov5);
} // end of omp parallel region } // end of omp parallel region
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5); ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) { if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
@ -440,12 +440,12 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
} }
if (vflag) { if (vflag) {
if (NEWTON_PAIR == 0) { if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5; ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5; ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5; ov2 *= (acc_t)0.5;
ov3 *= (acc_t)0.5; ov3 *= (acc_t)0.5;
ov4 *= (acc_t)0.5; ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5; ov5 *= (acc_t)0.5;
} }
ev_global[2] = ov0; ev_global[2] = ov0;
ev_global[3] = ov1; ev_global[3] = ov1;
@ -486,7 +486,7 @@ void PairBuckCoulLongIntel::init_style()
error->all(FLERR, error->all(FLERR,
"The 'package intel' command is required for /intel styles"); "The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]); fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check(); fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
_cop = fix->coprocessor_number(); _cop = fix->coprocessor_number();
@ -549,7 +549,7 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int j = 0; j < tp1; j++) { for (int j = 0; j < tp1; j++) {
if (cutsq[i][j] < cut_ljsq[i][j]) if (cutsq[i][j] < cut_ljsq[i][j])
error->all(FLERR, error->all(FLERR,
"Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic"); "Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic");
fc.c_force[i][j].cutsq = cutsq[i][j]; fc.c_force[i][j].cutsq = cutsq[i][j];
fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j]; fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
fc.c_force[i][j].buck1 = buck1[i][j]; fc.c_force[i][j].buck1 = buck1[i][j];
@ -603,9 +603,9 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void PairBuckCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, void PairBuckCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int ntable, const int ntable,
Memory *memory, Memory *memory,
const int cop) { const int cop) {
if ( (ntypes != _ntypes || ntable != _ntable) ) { if ( (ntypes != _ntypes || ntable != _ntable) ) {
if (_ntypes > 0) { if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
@ -625,10 +625,10 @@ void PairBuckCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
ospecial_coul != NULL && _cop >= 0) { ospecial_coul != NULL && _cop >= 0) {
#pragma offload_transfer target(mic:cop) \ #pragma offload_transfer target(mic:cop) \
nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
nocopy(orho_inv: alloc_if(0) free_if(1)) \ nocopy(orho_inv: alloc_if(0) free_if(1)) \
nocopy(otable: alloc_if(0) free_if(1)) \ nocopy(otable: alloc_if(0) free_if(1)) \
nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
} }
#endif #endif

View File

@ -50,8 +50,8 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong {
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void eval(const int offload, const int vflag, void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc, const int astart, const int aend); const ForceConst<flt_t> &fc, const int astart, const int aend);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
@ -76,7 +76,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong {
~ForceConst() { set_ntypes(0,0,NULL,_cop); } ~ForceConst() { set_ntypes(0,0,NULL,_cop); }
void set_ntypes(const int ntypes, const int ntable, Memory *memory, void set_ntypes(const int ntypes, const int ntable, Memory *memory,
const int cop); const int cop);
private: private:
int _ntypes, _ntable, _cop; int _ntypes, _ntable, _cop;

View File

@ -48,7 +48,7 @@ PairBuckIntel::~PairBuckIntel()
void PairBuckIntel::compute(int eflag, int vflag) void PairBuckIntel::compute(int eflag, int vflag)
{ {
if (fix->precision()==FixIntel::PREC_MODE_MIXED) if (fix->precision()==FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single); force_const_single);
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(), compute<double,double>(eflag, vflag, fix->get_double_buffers(),
@ -63,8 +63,8 @@ void PairBuckIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void PairBuckIntel::compute(int eflag, int vflag, void PairBuckIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) { if (eflag || vflag) {
ev_setup(eflag,vflag); ev_setup(eflag,vflag);
@ -87,13 +87,13 @@ void PairBuckIntel::compute(int eflag, int vflag,
#endif #endif
{ {
int ifrom, ito, tid; int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
packthreads, sizeof(ATOM_T)); packthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago); buffers->thr_pack(ifrom,ito,ago);
} }
fix->stop_watch(TIME_PACK); fix->stop_watch(TIME_PACK);
} }
int ovflag = 0; int ovflag = 0;
if (vflag_fdotr) ovflag = 2; if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1; else if (vflag) ovflag = 1;
@ -120,9 +120,9 @@ void PairBuckIntel::compute(int eflag, int vflag,
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairBuckIntel::eval(const int offload, const int vflag, void PairBuckIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc, const ForceConst<flt_t> &fc,
const int astart, const int aend) const int astart, const int aend)
{ {
const int inum = aend - astart; const int inum = aend - astart;
if (inum == 0) return; if (inum == 0) return;
@ -147,8 +147,8 @@ void PairBuckIntel::eval(const int offload, const int vflag,
// Determine how much data to transfer // Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag; int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
buffers, offload, fix, separate_flag, buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride); x_size, q_size, ev_size, f_stride);
int tc; int tc;
FORCE_T * _noalias f_start; FORCE_T * _noalias f_start;
@ -160,7 +160,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
int *overflow = fix->get_off_overflow_flag(); int *overflow = fix->get_off_overflow_flag();
double *timer_compute = fix->off_watch_pair(); double *timer_compute = fix->off_watch_pair();
// Redeclare as local variables for offload // Redeclare as local variables for offload
if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY); if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
#pragma offload target(mic:_cop) if(offload) \ #pragma offload target(mic:_cop) if(offload) \
in(special_lj:length(0) alloc_if(0) free_if(0)) \ in(special_lj:length(0) alloc_if(0) free_if(0)) \
@ -182,8 +182,8 @@ void PairBuckIntel::eval(const int offload, const int vflag,
*timer_compute = MIC_Wtime(); *timer_compute = MIC_Wtime();
#endif #endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, 0); f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0; if (EFLAG) oevdwl = (acc_t)0;
@ -215,23 +215,23 @@ void PairBuckIntel::eval(const int offload, const int vflag,
const int jnum = numneigh[i]; const int jnum = numneigh[i];
acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t fxtmp,fytmp,fztmp,fwtmp;
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
const flt_t xtmp = x[i].x; const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y; const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z; const flt_t ztmp = x[i].z;
fxtmp = fytmp = fztmp = (acc_t)0; fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = (acc_t)0; if (EFLAG) fwtmp = sevdwl = (acc_t)0;
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5) sv0, sv1, sv2, sv3, sv4, sv5)
#endif #endif
for (int jj = 0; jj < jnum; jj++) { for (int jj = 0; jj < jnum; jj++) {
flt_t forcebuck, evdwl; flt_t forcebuck, evdwl;
forcebuck = evdwl = (flt_t)0.0; forcebuck = evdwl = (flt_t)0.0;
@ -245,7 +245,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t rsq = delx * delx + dely * dely + delz * delz;
const flt_t r = sqrt(rsq); const flt_t r = sqrt(rsq);
const flt_t r2inv = (flt_t)1.0 / rsq; const flt_t r2inv = (flt_t)1.0 / rsq;
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
if (rsq < c_forcei[jtype].cutsq) { if (rsq < c_forcei[jtype].cutsq) {
#endif #endif
@ -257,7 +257,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
#ifndef INTEL_VMASK #ifndef INTEL_VMASK
if (rsq > c_forcei[jtype].cutsq) if (rsq > c_forcei[jtype].cutsq)
forcebuck =(flt_t)0.0; forcebuck =(flt_t)0.0;
#endif #endif
if (EFLAG) { if (EFLAG) {
evdwl = rexp * c_energyi[jtype].a - evdwl = rexp * c_energyi[jtype].a -
r6inv * c_energyi[jtype].c - r6inv * c_energyi[jtype].c -
@ -272,67 +272,67 @@ void PairBuckIntel::eval(const int offload, const int vflag,
if (sbindex) { if (sbindex) {
const flt_t factor_lj = special_lj[sbindex]; const flt_t factor_lj = special_lj[sbindex];
forcebuck *= factor_lj; forcebuck *= factor_lj;
if (EFLAG) if (EFLAG)
evdwl *= factor_lj; evdwl *= factor_lj;
} }
const flt_t fpair = forcebuck * r2inv; const flt_t fpair = forcebuck * r2inv;
const flt_t fpx = fpair * delx; const flt_t fpx = fpair * delx;
fxtmp += fpx; fxtmp += fpx;
if (NEWTON_PAIR) f[j].x -= fpx; if (NEWTON_PAIR) f[j].x -= fpx;
const flt_t fpy = fpair * dely; const flt_t fpy = fpair * dely;
fytmp += fpy; fytmp += fpy;
if (NEWTON_PAIR) f[j].y -= fpy; if (NEWTON_PAIR) f[j].y -= fpy;
const flt_t fpz = fpair * delz; const flt_t fpz = fpair * delz;
fztmp += fpz; fztmp += fpz;
if (NEWTON_PAIR) f[j].z -= fpz; if (NEWTON_PAIR) f[j].z -= fpz;
if (EFLAG) { if (EFLAG) {
sevdwl += evdwl; sevdwl += evdwl;
if (eatom) { if (eatom) {
fwtmp += (flt_t)0.5 * evdwl; fwtmp += (flt_t)0.5 * evdwl;
if (NEWTON_PAIR) if (NEWTON_PAIR)
f[j].w += (flt_t)0.5 * evdwl; f[j].w += (flt_t)0.5 * evdwl;
} }
} }
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
} }
#endif #endif
} // for jj } // for jj
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
f[i].x += fxtmp; f[i].x += fxtmp;
f[i].y += fytmp; f[i].y += fytmp;
f[i].z += fztmp; f[i].z += fztmp;
} else { } else {
f[i].x = fxtmp; f[i].x = fxtmp;
f[i].y = fytmp; f[i].y = fytmp;
f[i].z = fztmp; f[i].z = fztmp;
} }
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
} // for ii } // for ii
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
ov4, ov5); ov4, ov5);
} // end of omp parallel region } // end of omp parallel region
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5); ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) { if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl; ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0; ev_global[1] = (acc_t)0;
} }
if (vflag) { if (vflag) {
if (NEWTON_PAIR == 0) { if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5; ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5; ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5; ov2 *= (acc_t)0.5;
ov3 *= (acc_t)0.5; ov3 *= (acc_t)0.5;
ov4 *= (acc_t)0.5; ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5; ov5 *= (acc_t)0.5;
} }
ev_global[2] = ov0; ev_global[2] = ov0;
ev_global[3] = ov1; ev_global[3] = ov1;
@ -371,7 +371,7 @@ void PairBuckIntel::init_style()
error->all(FLERR, error->all(FLERR,
"The 'package intel' command is required for /intel styles"); "The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]); fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check(); fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
_cop = fix->coprocessor_number(); _cop = fix->coprocessor_number();
@ -442,7 +442,7 @@ void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc,
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
template <class flt_t> template <class flt_t>
void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
Memory *memory, Memory *memory,
const int cop) { const int cop) {
if ( (ntypes != _ntypes ) ) { if ( (ntypes != _ntypes ) ) {
@ -452,8 +452,8 @@ void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
c_force_t * oc_force = c_force[0]; c_force_t * oc_force = c_force[0];
c_energy_t * oc_energy = c_energy[0]; c_energy_t * oc_energy = c_energy[0];
if (ospecial_lj != NULL && oc_force != NULL && if (ospecial_lj != NULL && oc_force != NULL &&
oc_energy != NULL && oc_energy != NULL &&
_cop >= 0) { _cop >= 0) {
#pragma offload_transfer target(mic:cop) \ #pragma offload_transfer target(mic:cop) \
nocopy(ospecial_lj: alloc_if(0) free_if(1)) \ nocopy(ospecial_lj: alloc_if(0) free_if(1)) \
@ -476,8 +476,8 @@ void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
c_force_t * oc_force = c_force[0]; c_force_t * oc_force = c_force[0];
c_energy_t * oc_energy = c_energy[0]; c_energy_t * oc_energy = c_energy[0];
int tp1sq = ntypes*ntypes; int tp1sq = ntypes*ntypes;
if (ospecial_lj != NULL && oc_force != NULL && if (ospecial_lj != NULL && oc_force != NULL &&
oc_energy != NULL && oc_energy != NULL &&
cop >= 0) { cop >= 0) {
#pragma offload_transfer target(mic:cop) \ #pragma offload_transfer target(mic:cop) \
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \

View File

@ -50,8 +50,8 @@ private:
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void eval(const int offload, const int vflag, void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc, const int astart, const int aend); const ForceConst<flt_t> &fc, const int astart, const int aend);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
@ -59,7 +59,7 @@ private:
template <class flt_t> template <class flt_t>
class ForceConst { class ForceConst {
public: public:
typedef struct { flt_t buck1, buck2, rhoinv, cutsq; } c_force_t; typedef struct { flt_t buck1, buck2, rhoinv, cutsq; } c_force_t;
typedef struct { flt_t a, c, offset, pad; } c_energy_t; typedef struct { flt_t a, c, offset, pad; } c_energy_t;
@ -78,7 +78,7 @@ private:
int _ntypes, _cop; int _ntypes, _cop;
Memory *_memory; Memory *_memory;
}; };
ForceConst<float> force_const_single; ForceConst<float> force_const_single;
ForceConst<double> force_const_double; ForceConst<double> force_const_double;
}; };

View File

@ -74,8 +74,8 @@ void PairEAMIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void PairEAMIntel::compute(int eflag, int vflag, void PairEAMIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) { if (eflag || vflag) {
ev_setup(eflag, vflag); ev_setup(eflag, vflag);
@ -111,37 +111,37 @@ void PairEAMIntel::compute(int eflag, int vflag,
if (_onetype) { if (_onetype) {
if (eflag) { if (eflag) {
if (force->newton_pair) { if (force->newton_pair) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else { } else {
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
} }
} else { } else {
if (force->newton_pair) { if (force->newton_pair) {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else { } else {
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
} }
} }
} else { } else {
if (eflag) { if (eflag) {
if (force->newton_pair) { if (force->newton_pair) {
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else { } else {
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
} }
} else { } else {
if (force->newton_pair) { if (force->newton_pair) {
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else { } else {
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
} }
} }
} }
@ -151,8 +151,8 @@ void PairEAMIntel::compute(int eflag, int vflag,
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairEAMIntel::eval(const int offload, const int vflag, void PairEAMIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc, const ForceConst<flt_t> &fc,
const int astart, const int aend) const int astart, const int aend)
{ {
const int inum = aend - astart; const int inum = aend - astart;
@ -251,8 +251,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
#endif #endif
{ {
int iifrom, iito, tid; int iifrom, iito, tid;
IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads, IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads,
INTEL_VECTOR_WIDTH); INTEL_VECTOR_WIDTH);
iifrom += astart; iifrom += astart;
iito += astart; iito += astart;
@ -264,8 +264,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
else foff = 0; else foff = 0;
double * _noalias const trho = rho + foff; double * _noalias const trho = rho + foff;
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
memset(trho, 0, nall * sizeof(double)); memset(trho, 0, nall * sizeof(double));
} }
const int toffs = tid * ccache_stride; const int toffs = tid * ccache_stride;
@ -280,108 +280,108 @@ void PairEAMIntel::eval(const int offload, const int vflag,
int rhor_joff, frho_ioff; int rhor_joff, frho_ioff;
if (ONETYPE) { if (ONETYPE) {
const int ptr_off=_onetype * ntypes + _onetype; const int ptr_off=_onetype * ntypes + _onetype;
oscale = scale_f[ptr_off]; oscale = scale_f[ptr_off];
int rhor_ioff = istride * _onetype; int rhor_ioff = istride * _onetype;
rhor_joff = rhor_ioff + _onetype * jstride; rhor_joff = rhor_ioff + _onetype * jstride;
frho_ioff = fstride * _onetype; frho_ioff = fstride * _onetype;
} }
for (int i = iifrom; i < iito; ++i) { for (int i = iifrom; i < iito; ++i) {
int itype, rhor_ioff; int itype, rhor_ioff;
if (!ONETYPE) { if (!ONETYPE) {
itype = x[i].w; itype = x[i].w;
rhor_ioff = istride * itype; rhor_ioff = istride * itype;
} }
const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i]; const int jnum = numneigh[i];
const flt_t xtmp = x[i].x; const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y; const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z; const flt_t ztmp = x[i].z;
acc_t rhoi = (acc_t)0.0; acc_t rhoi = (acc_t)0.0;
int ej = 0; int ej = 0;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma ivdep #pragma ivdep
#endif #endif
for (int jj = 0; jj < jnum; jj++) { for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj] & NEIGHMASK; const int j = jlist[jj] & NEIGHMASK;
const flt_t delx = xtmp - x[j].x; const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y; const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z; const flt_t delz = ztmp - x[j].z;
const flt_t rsq = delx*delx + dely*dely + delz*delz; const flt_t rsq = delx*delx + dely*dely + delz*delz;
if (rsq < fcutforcesq) { if (rsq < fcutforcesq) {
trsq[ej]=rsq; trsq[ej]=rsq;
if (!ONETYPE) tjtype[ej]=x[j].w; if (!ONETYPE) tjtype[ej]=x[j].w;
tj[ej]=jlist[jj]; tj[ej]=jlist[jj];
ej++; ej++;
} }
} }
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd reduction(+:rhoi) #pragma simd reduction(+:rhoi)
#endif #endif
for (int jj = 0; jj < ej; jj++) { for (int jj = 0; jj < ej; jj++) {
int jtype; int jtype;
const int j = tj[jj] & NEIGHMASK; const int j = tj[jj] & NEIGHMASK;
if (!ONETYPE) jtype = tjtype[jj]; if (!ONETYPE) jtype = tjtype[jj];
const flt_t rsq = trsq[jj]; const flt_t rsq = trsq[jj];
flt_t p = sqrt(rsq)*frdr + (flt_t)1.0; flt_t p = sqrt(rsq)*frdr + (flt_t)1.0;
int m = static_cast<int> (p); int m = static_cast<int> (p);
m = MIN(m,nr-1); m = MIN(m,nr-1);
p -= m; p -= m;
p = MIN(p,(flt_t)1.0); p = MIN(p,(flt_t)1.0);
if (!ONETYPE) if (!ONETYPE)
rhor_joff = rhor_ioff + jtype * jstride; rhor_joff = rhor_ioff + jtype * jstride;
const int joff = rhor_joff + m; const int joff = rhor_joff + m;
flt_t ra; flt_t ra;
ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p + ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d; rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
rhoi += ra; rhoi += ra;
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
if (!ONETYPE) { if (!ONETYPE) {
const int ioff = jtype * istride + itype * jstride + m; const int ioff = jtype * istride + itype * jstride + m;
ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p + ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d; rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
} }
trho[j] += ra; trho[j] += ra;
} }
} // for jj } // for jj
if (NEWTON_PAIR) if (NEWTON_PAIR)
trho[i] += rhoi; trho[i] += rhoi;
else else
trho[i] = rhoi; trho[i] = rhoi;
} // for i } // for i
#if defined(_OPENMP) #if defined(_OPENMP)
if (NEWTON_PAIR && nthreads > 1) { if (NEWTON_PAIR && nthreads > 1) {
#pragma omp barrier #pragma omp barrier
if (tid == 0) { if (tid == 0) {
const int rcount = nall; const int rcount = nall;
if (nthreads == 2) { if (nthreads == 2) {
double *trho2 = rho + nmax; double *trho2 = rho + nmax;
#pragma vector aligned #pragma vector aligned
#pragma simd #pragma simd
for (int n = 0; n < rcount; n++) for (int n = 0; n < rcount; n++)
rho[n] += trho2[n]; rho[n] += trho2[n];
} else if (nthreads == 4) { } else if (nthreads == 4) {
double *trho2 = rho + nmax; double *trho2 = rho + nmax;
double *trho3 = trho2 + nmax; double *trho3 = trho2 + nmax;
double *trho4 = trho3 + nmax; double *trho4 = trho3 + nmax;
#pragma vector aligned #pragma vector aligned
#pragma simd #pragma simd
for (int n = 0; n < rcount; n++) for (int n = 0; n < rcount; n++)
rho[n] += trho2[n] + trho3[n] + trho4[n]; rho[n] += trho2[n] + trho3[n] + trho4[n];
} else { } else {
double *trhon = rho + nmax; double *trhon = rho + nmax;
for (int t = 1; t < nthreads; t++) { for (int t = 1; t < nthreads; t++) {
#pragma vector aligned #pragma vector aligned
#pragma simd #pragma simd
for (int n = 0; n < rcount; n++) for (int n = 0; n < rcount; n++)
rho[n] += trhon[n]; rho[n] += trhon[n];
trhon += nmax; trhon += nmax;
} }
} }
} }
@ -411,32 +411,32 @@ void PairEAMIntel::eval(const int offload, const int vflag,
#pragma simd reduction(+:tevdwl) #pragma simd reduction(+:tevdwl)
#endif #endif
for (int i = iifrom; i < iito; ++i) { for (int i = iifrom; i < iito; ++i) {
int itype; int itype;
if (!ONETYPE) itype = x[i].w; if (!ONETYPE) itype = x[i].w;
flt_t p = rho[i]*frdrho + (flt_t)1.0; flt_t p = rho[i]*frdrho + (flt_t)1.0;
int m = static_cast<int> (p); int m = static_cast<int> (p);
m = MAX(1,MIN(m,nrho-1)); m = MAX(1,MIN(m,nrho-1));
p -= m; p -= m;
p = MIN(p,(flt_t)1.0); p = MIN(p,(flt_t)1.0);
if (!ONETYPE) frho_ioff = itype * fstride; if (!ONETYPE) frho_ioff = itype * fstride;
const int ioff = frho_ioff + m; const int ioff = frho_ioff + m;
fp_f[i] = (frho_spline_f[ioff].a*p + frho_spline_f[ioff].b)*p + fp_f[i] = (frho_spline_f[ioff].a*p + frho_spline_f[ioff].b)*p +
frho_spline_f[ioff].c; frho_spline_f[ioff].c;
if (EFLAG) { if (EFLAG) {
flt_t phi = ((frho_spline_e[ioff].a*p + frho_spline_e[ioff].b)*p + flt_t phi = ((frho_spline_e[ioff].a*p + frho_spline_e[ioff].b)*p +
frho_spline_e[ioff].c)*p + frho_spline_e[ioff].d; frho_spline_e[ioff].c)*p + frho_spline_e[ioff].d;
if (rho[i] > frhomax) phi += fp_f[i] * (rho[i]-frhomax); if (rho[i] > frhomax) phi += fp_f[i] * (rho[i]-frhomax);
if (!ONETYPE) { if (!ONETYPE) {
const int ptr_off=itype*ntypes + itype; const int ptr_off=itype*ntypes + itype;
oscale = scale_f[ptr_off]; oscale = scale_f[ptr_off];
} }
phi *= oscale; phi *= oscale;
tevdwl += phi; tevdwl += phi;
if (eatom) f[i].w += phi; if (eatom) f[i].w += phi;
} }
} }
if (EFLAG) oevdwl += tevdwl; if (EFLAG) oevdwl += tevdwl;
// communicate derivative of embedding function // communicate derivative of embedding function
@ -447,7 +447,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
if (tid == 0) if (tid == 0)
comm->forward_comm_pair(this); comm->forward_comm_pair(this);
if (NEWTON_PAIR) if (NEWTON_PAIR)
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp barrier #pragma omp barrier
@ -458,94 +458,94 @@ void PairEAMIntel::eval(const int offload, const int vflag,
for (int i = iifrom; i < iito; ++i) { for (int i = iifrom; i < iito; ++i) {
int itype, rhor_ioff; int itype, rhor_ioff;
const flt_t * _noalias scale_fi; const flt_t * _noalias scale_fi;
if (!ONETYPE) { if (!ONETYPE) {
itype = x[i].w; itype = x[i].w;
rhor_ioff = istride * itype; rhor_ioff = istride * itype;
scale_fi = scale_f + itype*ntypes; scale_fi = scale_f + itype*ntypes;
} }
const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i]; const int jnum = numneigh[i];
acc_t fxtmp, fytmp, fztmp, fwtmp; acc_t fxtmp, fytmp, fztmp, fwtmp;
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
const flt_t xtmp = x[i].x; const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y; const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z; const flt_t ztmp = x[i].z;
fxtmp = fytmp = fztmp = (acc_t)0; fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = (acc_t)0; if (EFLAG) fwtmp = sevdwl = (acc_t)0;
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
int ej = 0; int ej = 0;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma ivdep #pragma ivdep
#endif #endif
for (int jj = 0; jj < jnum; jj++) { for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj] & NEIGHMASK; const int j = jlist[jj] & NEIGHMASK;
const flt_t delx = xtmp - x[j].x; const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y; const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z; const flt_t delz = ztmp - x[j].z;
const flt_t rsq = delx*delx + dely*dely + delz*delz; const flt_t rsq = delx*delx + dely*dely + delz*delz;
if (rsq < fcutforcesq) { if (rsq < fcutforcesq) {
trsq[ej]=rsq; trsq[ej]=rsq;
tdelx[ej]=delx; tdelx[ej]=delx;
tdely[ej]=dely; tdely[ej]=dely;
tdelz[ej]=delz; tdelz[ej]=delz;
if (!ONETYPE) tjtype[ej]=x[j].w; if (!ONETYPE) tjtype[ej]=x[j].w;
tj[ej]=jlist[jj]; tj[ej]=jlist[jj];
ej++; ej++;
} }
} }
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5) sv0, sv1, sv2, sv3, sv4, sv5)
#endif #endif
for (int jj = 0; jj < ej; jj++) { for (int jj = 0; jj < ej; jj++) {
int jtype; int jtype;
const int j = tj[jj] & NEIGHMASK; const int j = tj[jj] & NEIGHMASK;
if (!ONETYPE) jtype = tjtype[jj]; if (!ONETYPE) jtype = tjtype[jj];
const flt_t rsq = trsq[jj]; const flt_t rsq = trsq[jj];
const flt_t r = sqrt(rsq); const flt_t r = sqrt(rsq);
flt_t p = r*frdr + (flt_t)1.0; flt_t p = r*frdr + (flt_t)1.0;
int m = static_cast<int> (p); int m = static_cast<int> (p);
m = MIN(m,nr-1); m = MIN(m,nr-1);
p -= m; p -= m;
p = MIN(p,(flt_t)1.0); p = MIN(p,(flt_t)1.0);
if (!ONETYPE) if (!ONETYPE)
rhor_joff = rhor_ioff + jtype * jstride; rhor_joff = rhor_ioff + jtype * jstride;
const int joff = rhor_joff + m; const int joff = rhor_joff + m;
const flt_t rhojp = (rhor_spline_f[joff].a*p + const flt_t rhojp = (rhor_spline_f[joff].a*p +
rhor_spline_f[joff].b)*p + rhor_spline_f[joff].b)*p +
rhor_spline_f[joff].c; rhor_spline_f[joff].c;
flt_t rhoip; flt_t rhoip;
if (!ONETYPE) { if (!ONETYPE) {
const int ioff = jtype * istride + itype * jstride + m; const int ioff = jtype * istride + itype * jstride + m;
rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p + rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p +
rhor_spline_f[ioff].c; rhor_spline_f[ioff].c;
} else } else
rhoip = rhojp; rhoip = rhojp;
const flt_t z2p = (z2r_spline_t[joff].a*p + const flt_t z2p = (z2r_spline_t[joff].a*p +
z2r_spline_t[joff].b)*p + z2r_spline_t[joff].b)*p +
z2r_spline_t[joff].c; z2r_spline_t[joff].c;
const flt_t z2 = ((z2r_spline_t[joff].d*p + const flt_t z2 = ((z2r_spline_t[joff].d*p +
z2r_spline_t[joff].e)*p + z2r_spline_t[joff].e)*p +
z2r_spline_t[joff].f)*p + z2r_spline_t[joff].f)*p +
z2r_spline_t[joff].g; z2r_spline_t[joff].g;
const flt_t recip = (flt_t)1.0/r; const flt_t recip = (flt_t)1.0/r;
const flt_t phi = z2*recip; const flt_t phi = z2*recip;
const flt_t phip = z2p*recip - phi*recip; const flt_t phip = z2p*recip - phi*recip;
const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip; const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip;
if (!ONETYPE) if (!ONETYPE)
oscale = scale_fi[jtype]; oscale = scale_fi[jtype];
const flt_t fpair = -oscale*psip*recip; const flt_t fpair = -oscale*psip*recip;
const flt_t fpx = fpair * tdelx[jj]; const flt_t fpx = fpair * tdelx[jj];
fxtmp += fpx; fxtmp += fpx;
if (NEWTON_PAIR) f[j].x -= fpx; if (NEWTON_PAIR) f[j].x -= fpx;
@ -556,20 +556,20 @@ void PairEAMIntel::eval(const int offload, const int vflag,
fztmp += fpz; fztmp += fpz;
if (NEWTON_PAIR) f[j].z -= fpz; if (NEWTON_PAIR) f[j].z -= fpz;
if (EFLAG) { if (EFLAG) {
const flt_t evdwl = oscale*phi; const flt_t evdwl = oscale*phi;
sevdwl += evdwl; sevdwl += evdwl;
if (eatom) { if (eatom) {
fwtmp += (flt_t)0.5 * evdwl; fwtmp += (flt_t)0.5 * evdwl;
if (NEWTON_PAIR) if (NEWTON_PAIR)
f[j].w += (flt_t)0.5 * evdwl; f[j].w += (flt_t)0.5 * evdwl;
} }
} }
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
fpx, fpy, fpz); fpx, fpy, fpz);
} // for jj } // for jj
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
f[i].x += fxtmp; f[i].x += fxtmp;
f[i].y += fytmp; f[i].y += fytmp;
f[i].z += fztmp; f[i].z += fztmp;
@ -577,19 +577,19 @@ void PairEAMIntel::eval(const int offload, const int vflag,
f[i].x = fxtmp; f[i].x = fxtmp;
f[i].y = fytmp; f[i].y = fytmp;
f[i].z = fztmp; f[i].z = fztmp;
sevdwl *= (acc_t)0.5; sevdwl *= (acc_t)0.5;
} }
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
} // for i } // for i
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
ov4, ov5); ov4, ov5);
} /// omp } /// omp
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5); ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) { if (EFLAG) {
ev_global[0] = oevdwl; ev_global[0] = oevdwl;
@ -597,13 +597,13 @@ void PairEAMIntel::eval(const int offload, const int vflag,
} }
if (vflag) { if (vflag) {
if (NEWTON_PAIR == 0) { if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5; ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5; ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5; ov2 *= (acc_t)0.5;
ov3 *= (acc_t)0.5; ov3 *= (acc_t)0.5;
ov4 *= (acc_t)0.5; ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5; ov5 *= (acc_t)0.5;
} }
ev_global[2] = ov0; ev_global[2] = ov0;
ev_global[3] = ov1; ev_global[3] = ov1;
ev_global[4] = ov2; ev_global[4] = ov2;
@ -665,7 +665,7 @@ void PairEAMIntel::init_style()
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc, void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers) IntelBuffers<flt_t,acc_t> *buffers)
{ {
int off_ccache = 0; int off_ccache = 0;
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
@ -684,14 +684,14 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int i = 1; i <= atom->ntypes; i++) { for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) { for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j); cut = init_one(i,j);
cutneigh = cut + neighbor->skin; cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut; cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
} }
} }
} }
_onetype=-1; _onetype=-1;
double oldscale=-1; double oldscale=-1;
for (int i = 1; i < tp1; i++) { for (int i = 1; i < tp1; i++) {
@ -709,32 +709,32 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int j = 1; j < tp1; j++) { for (int j = 1; j < tp1; j++) {
fc.scale_f[i][j] = scale[i][j]; fc.scale_f[i][j] = scale[i][j];
if (type2rhor[i][j] >= 0) { if (type2rhor[i][j] >= 0) {
const int joff = ioff + j * fc.rhor_jstride(); const int joff = ioff + j * fc.rhor_jstride();
for (int k = 0; k < nr + 1; k++) { for (int k = 0; k < nr + 1; k++) {
if (type2rhor[j][i] != type2rhor[i][j]) if (type2rhor[j][i] != type2rhor[i][j])
_onetype = 0; _onetype = 0;
else if (_onetype < 0) else if (_onetype < 0)
_onetype = i; _onetype = i;
if (oldscale < 0) if (oldscale < 0)
oldscale = scale[i][j]; oldscale = scale[i][j];
else else
if (oldscale != scale[i][j]) if (oldscale != scale[i][j])
_onetype = 0; _onetype = 0;
fc.rhor_spline_f[joff + k].a=rhor_spline[type2rhor[j][i]][k][0]; fc.rhor_spline_f[joff + k].a=rhor_spline[type2rhor[j][i]][k][0];
fc.rhor_spline_f[joff + k].b=rhor_spline[type2rhor[j][i]][k][1]; fc.rhor_spline_f[joff + k].b=rhor_spline[type2rhor[j][i]][k][1];
fc.rhor_spline_f[joff + k].c=rhor_spline[type2rhor[j][i]][k][2]; fc.rhor_spline_f[joff + k].c=rhor_spline[type2rhor[j][i]][k][2];
fc.rhor_spline_e[joff + k].a=rhor_spline[type2rhor[j][i]][k][3]; fc.rhor_spline_e[joff + k].a=rhor_spline[type2rhor[j][i]][k][3];
fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4]; fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4];
fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5]; fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5];
fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6]; fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6];
fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0]; fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0];
fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1]; fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1];
fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2]; fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2];
fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3]; fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3];
fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4]; fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4];
fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5]; fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5];
fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6]; fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6];
} }
} }
} }
} }
@ -745,9 +745,9 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int nr, const int nrho, const int nr, const int nrho,
Memory *memory, Memory *memory,
const int cop) { const int cop) {
if (ntypes != _ntypes || nr > _nr || nrho > _nrho) { if (ntypes != _ntypes || nr > _nr || nrho > _nrho) {
if (_ntypes > 0) { if (_ntypes > 0) {
_memory->destroy(rhor_spline_f); _memory->destroy(rhor_spline_f);
@ -780,7 +780,7 @@ void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf, int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf,
int pbc_flag, int *pbc) int pbc_flag, int *pbc)
{ {
if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
return pack_forward_comm(n, list, buf, fp); return pack_forward_comm(n, list, buf, fp);
@ -802,7 +802,7 @@ void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf)
template<class flt_t> template<class flt_t>
int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf, int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf,
flt_t *fp_f) flt_t *fp_f)
{ {
int i,j,m; int i,j,m;
@ -817,8 +817,8 @@ int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf,
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
template<class flt_t> template<class flt_t>
void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf, void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf,
flt_t *fp_f) flt_t *fp_f)
{ {
int i,m,last; int i,m,last;

View File

@ -53,8 +53,8 @@ class PairEAMIntel : public PairEAM {
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t,
class acc_t> class acc_t>
void eval(const int offload, const int vflag, void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc, const int astart, const int aend); const ForceConst<flt_t> &fc, const int astart, const int aend);
@ -79,8 +79,8 @@ class PairEAMIntel : public PairEAM {
ForceConst() : _ntypes(0), _nr(0) {} ForceConst() : _ntypes(0), _nr(0) {}
~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); } ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
void set_ntypes(const int ntypes, const int nr, const int nrho, void set_ntypes(const int ntypes, const int nr, const int nrho,
Memory *memory, const int cop); Memory *memory, const int cop);
inline int rhor_jstride() const { return _nr; } inline int rhor_jstride() const { return _nr; }
inline int rhor_istride() const { return _nr * _ntypes; } inline int rhor_istride() const { return _nr * _ntypes; }
inline int frho_stride() const { return _nrho; } inline int frho_stride() const { return _nrho; }

View File

@ -98,17 +98,17 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
{ {
int ifrom, ito, tid; int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads, IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads,
sizeof(ATOM_T)); sizeof(ATOM_T));
if (ago != 0) buffers->thr_pack(ifrom,ito,ago); if (ago != 0) buffers->thr_pack(ifrom,ito,ago);
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
int qi = ellipsoid[i]; int qi = ellipsoid[i];
if (qi > -1) { if (qi > -1) {
quat[i].w = bonus[qi].quat[0]; quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1]; quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2]; quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3]; quat[i].k = bonus[qi].quat[3];
} }
} }
} }
quat[nall].w = (flt_t)1.0; quat[nall].w = (flt_t)1.0;
@ -161,65 +161,65 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
if (fix->separate_buffers()) { if (fix->separate_buffers()) {
fix->start_watch(TIME_PACK); fix->start_watch(TIME_PACK);
if (offload) { if (offload) {
#pragma omp parallel #pragma omp parallel
{ {
int ifrom, ito, tid; int ifrom, ito, tid;
int nthreads = comm->nthreads; int nthreads = comm->nthreads;
IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,
nthreads, sizeof(ATOM_T)); nthreads, sizeof(ATOM_T));
if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0); if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
int qi = ellipsoid[i]; int qi = ellipsoid[i];
if (qi > -1) { if (qi > -1) {
quat[i].w = bonus[qi].quat[0]; quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1]; quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2]; quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3]; quat[i].k = bonus[qi].quat[3];
} }
} }
int nghost = nall - nlocal; int nghost = nall - nlocal;
if (nghost) { if (nghost) {
IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,
nthreads, sizeof(ATOM_T)); nthreads, sizeof(ATOM_T));
int offset = 0; int offset = 0;
ifrom += nlocal; ifrom += nlocal;
ito += nlocal; ito += nlocal;
if (ago != 0) { if (ago != 0) {
offset = fix->offload_min_ghost() - nlocal; offset = fix->offload_min_ghost() - nlocal;
buffers->thr_pack_cop(ifrom, ito, offset, ago == 1); buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
} }
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
int qi = ellipsoid[i + offset]; int qi = ellipsoid[i + offset];
if (qi > -1) { if (qi > -1) {
quat[i].w = bonus[qi].quat[0]; quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1]; quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2]; quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3]; quat[i].k = bonus[qi].quat[3];
} }
} }
} }
} }
} else { } else {
if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0); if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);
for (int i = fix->host_min_local(); i < nlocal; i++) { for (int i = fix->host_min_local(); i < nlocal; i++) {
int qi = ellipsoid[i]; int qi = ellipsoid[i];
if (qi > -1) { if (qi > -1) {
quat[i].w = bonus[qi].quat[0]; quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1]; quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2]; quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3]; quat[i].k = bonus[qi].quat[3];
} }
} }
int offset = fix->host_min_ghost() - nlocal; int offset = fix->host_min_ghost() - nlocal;
if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset); if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset);
for (int i = nlocal; i < nall; i++) { for (int i = nlocal; i < nall; i++) {
int qi = ellipsoid[i + offset]; int qi = ellipsoid[i + offset];
if (qi > -1) { if (qi > -1) {
quat[i].w = bonus[qi].quat[0]; quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1]; quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2]; quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3]; quat[i].k = bonus[qi].quat[3];
} }
} }
} }
fix->stop_watch(TIME_PACK); fix->stop_watch(TIME_PACK);
@ -252,8 +252,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
// Determine how much data to transfer // Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag; int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
buffers, offload, fix, separate_flag, buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride); x_size, q_size, ev_size, f_stride);
int tc; int tc;
FORCE_T * _noalias f_start; FORCE_T * _noalias f_start;
@ -303,26 +303,26 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (separate_flag) { if (separate_flag) {
if (separate_flag < 3) { if (separate_flag < 3) {
int all_local = nlocal; int all_local = nlocal;
int ghost_min = overflow[LMP_GHOST_MIN]; int ghost_min = overflow[LMP_GHOST_MIN];
nlocal = overflow[LMP_LOCAL_MAX] + 1; nlocal = overflow[LMP_LOCAL_MAX] + 1;
int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min; int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
if (nghost < 0) nghost = 0; if (nghost < 0) nghost = 0;
nall = nlocal + nghost; nall = nlocal + nghost;
separate_flag--; separate_flag--;
int flength; int flength;
if (NEWTON_PAIR) flength = nall; if (NEWTON_PAIR) flength = nall;
else flength = nlocal; else flength = nlocal;
IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T), IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),
separate_flag); separate_flag);
if (nghost) { if (nghost) {
if (nlocal < all_local || ghost_min > all_local) { if (nlocal < all_local || ghost_min > all_local) {
memmove(x + nlocal, x + ghost_min, memmove(x + nlocal, x + ghost_min,
(nall - nlocal) * sizeof(ATOM_T)); (nall - nlocal) * sizeof(ATOM_T));
memmove(quat + nlocal, quat + ghost_min, memmove(quat + nlocal, quat + ghost_min,
(nall - nlocal) * sizeof(QUAT_T)); (nall - nlocal) * sizeof(QUAT_T));
} }
} }
} }
x[nall].x = (flt_t)INTEL_BIGP; x[nall].x = (flt_t)INTEL_BIGP;
x[nall].y = (flt_t)INTEL_BIGP; x[nall].y = (flt_t)INTEL_BIGP;
@ -395,17 +395,17 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0; fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
if (EFLAG) fwtmp = sevdwl = (acc_t)0.0; if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
bool multiple_forms = false; bool multiple_forms = false;
int packed_j = 0; int packed_j = 0;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma ivdep #pragma ivdep
#endif #endif
for (int jj = 0; jj < jnum; jj++) { for (int jj = 0; jj < jnum; jj++) {
int jm = jlist[jj]; int jm = jlist[jj];
int j = jm & NEIGHMASK; int j = jm & NEIGHMASK;
const int jtype = x[j].w; const int jtype = x[j].w;
@ -428,27 +428,27 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
} else } else
multiple_forms = true; multiple_forms = true;
} }
const int edge = (packed_j % pad_width); const int edge = (packed_j % pad_width);
if (edge) { if (edge) {
const int packed_end = packed_j + (pad_width - edge); const int packed_end = packed_j + (pad_width - edge);
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma loop_count min=1, max=15, avg=8 #pragma loop_count min=1, max=15, avg=8
#endif #endif
for ( ; packed_j < packed_end; packed_j++) for ( ; packed_j < packed_end; packed_j++)
jlist_form[packed_j] = nall; jlist_form[packed_j] = nall;
} }
// ------------------------------------------------------------- // -------------------------------------------------------------
#ifdef INTEL_V512 #ifdef INTEL_V512
__assume(packed_j % INTEL_VECTOR_WIDTH == 0); __assume(packed_j % INTEL_VECTOR_WIDTH == 0);
__assume(packed_j % 8 == 0); __assume(packed_j % 8 == 0);
__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0); __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
#endif #endif
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \ #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#endif #endif
for (int jj = 0; jj < packed_j; jj++) { for (int jj = 0; jj < packed_j; jj++) {
flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8; flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
@ -458,15 +458,15 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2; flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2;
flt_t rtor_0, rtor_1, rtor_2; flt_t rtor_0, rtor_1, rtor_2;
const int sbindex = jlist_form[jj] >> SBBITS & 3; const int sbindex = jlist_form[jj] >> SBBITS & 3;
const int j = jlist_form[jj] & NEIGHMASK; const int j = jlist_form[jj] & NEIGHMASK;
flt_t factor_lj = special_lj[sbindex]; flt_t factor_lj = special_lj[sbindex];
const int jtype = jtype_form[jj]; const int jtype = jtype_form[jj];
const flt_t sigma = ijci[jtype].sigma; const flt_t sigma = ijci[jtype].sigma;
const flt_t epsilon = ijci[jtype].epsilon; const flt_t epsilon = ijci[jtype].epsilon;
const flt_t shape2_0 = ic[jtype].shape2[0]; const flt_t shape2_0 = ic[jtype].shape2[0];
const flt_t shape2_1 = ic[jtype].shape2[1]; const flt_t shape2_1 = ic[jtype].shape2[1];
const flt_t shape2_2 = ic[jtype].shape2[2]; const flt_t shape2_2 = ic[jtype].shape2[2];
flt_t one_eng, evdwl; flt_t one_eng, evdwl;
ME_quat_to_mat_trans(quat[j], a2); ME_quat_to_mat_trans(quat[j], a2);
@ -488,7 +488,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
ME_plus3(g1, g2, g12); ME_plus3(g1, g2, g12);
flt_t kappa_0, kappa_1, kappa_2; flt_t kappa_0, kappa_1, kappa_2;
ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj], ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj],
kappa, ierror); kappa, ierror);
// tempv = G12^-1*r12hat // tempv = G12^-1*r12hat
@ -520,7 +520,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
flt_t iota_0, iota_1, iota_2; flt_t iota_0, iota_1, iota_2;
ME_plus3(b1, b2, b12); ME_plus3(b1, b2, b12);
ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj], ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj],
iota, ierror); iota, ierror);
// tempv = G12^-1*r12hat // tempv = G12^-1*r12hat
@ -534,7 +534,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
// compute dUr/dr // compute dUr/dr
temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) / temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
sigma; sigma;
temp1 = temp1 * (flt_t)24.0 * epsilon; temp1 = temp1 * (flt_t)24.0 * epsilon;
flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5; flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
flt_t dUr_0, dUr_1, dUr_2; flt_t dUr_0, dUr_1, dUr_2;
@ -548,8 +548,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
flt_t dchi_0, dchi_1, dchi_2; flt_t dchi_0, dchi_1, dchi_2;
temp1 = ME_dot3(iota, r12hat); temp1 = ME_dot3(iota, r12hat);
temp2 = (flt_t)-4.0 / rsq_form[jj] * mu * temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
std::pow(chi, (mu - (flt_t)1.0) / mu); std::pow(chi, (mu - (flt_t)1.0) / mu);
dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0); dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1); dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2); dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
@ -663,36 +663,36 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
temp3 = chi * eta; temp3 = chi * eta;
ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) * ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) *
(flt_t)-1.0; (flt_t)-1.0;
ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) * ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) *
(flt_t)-1.0; (flt_t)-1.0;
ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) * ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
(flt_t)-1.0; (flt_t)-1.0;
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) * rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
(flt_t)-1.0; (flt_t)-1.0;
rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) * rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
(flt_t)-1.0; (flt_t)-1.0;
rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) * rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) *
(flt_t)-1.0; (flt_t)-1.0;
} }
one_eng = temp1 * chi; one_eng = temp1 * chi;
#ifndef INTEL_VMASK #ifndef INTEL_VMASK
if (jlist_form[jj] == nall) { if (jlist_form[jj] == nall) {
one_eng = (flt_t)0.0; one_eng = (flt_t)0.0;
fforce_0 = 0.0; fforce_0 = 0.0;
fforce_1 = 0.0; fforce_1 = 0.0;
fforce_2 = 0.0; fforce_2 = 0.0;
ttor_0 = 0.0; ttor_0 = 0.0;
ttor_1 = 0.0; ttor_1 = 0.0;
ttor_2 = 0.0; ttor_2 = 0.0;
rtor_0 = 0.0; rtor_0 = 0.0;
rtor_1 = 0.0; rtor_1 = 0.0;
rtor_2 = 0.0; rtor_2 = 0.0;
} }
#endif #endif
fforce_0 *= factor_lj; fforce_0 *= factor_lj;
fforce_1 *= factor_lj; fforce_1 *= factor_lj;
@ -701,53 +701,53 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
ttor_1 *= factor_lj; ttor_1 *= factor_lj;
ttor_2 *= factor_lj; ttor_2 *= factor_lj;
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
if (jlist_form[jj] < nall) { if (jlist_form[jj] < nall) {
#endif #endif
fxtmp += fforce_0; fxtmp += fforce_0;
fytmp += fforce_1; fytmp += fforce_1;
fztmp += fforce_2; fztmp += fforce_2;
t1tmp += ttor_0; t1tmp += ttor_0;
t2tmp += ttor_1; t2tmp += ttor_1;
t3tmp += ttor_2; t3tmp += ttor_2;
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
rtor_0 *= factor_lj; rtor_0 *= factor_lj;
rtor_1 *= factor_lj; rtor_1 *= factor_lj;
rtor_2 *= factor_lj; rtor_2 *= factor_lj;
int jp = j * 2; int jp = j * 2;
f[jp].x -= fforce_0; f[jp].x -= fforce_0;
f[jp].y -= fforce_1; f[jp].y -= fforce_1;
f[jp].z -= fforce_2; f[jp].z -= fforce_2;
jp++; jp++;
f[jp].x += rtor_0; f[jp].x += rtor_0;
f[jp].y += rtor_1; f[jp].y += rtor_1;
f[jp].z += rtor_2; f[jp].z += rtor_2;
} }
if (EFLAG) { if (EFLAG) {
evdwl = factor_lj * one_eng; evdwl = factor_lj * one_eng;
sevdwl += evdwl; sevdwl += evdwl;
if (eatom) { if (eatom) {
fwtmp += (flt_t)0.5 * evdwl; fwtmp += (flt_t)0.5 * evdwl;
if (NEWTON_PAIR) if (NEWTON_PAIR)
f[j*2].w += (flt_t)0.5 * evdwl; f[j*2].w += (flt_t)0.5 * evdwl;
} }
} }
if (NEWTON_PAIR == 0) { if (NEWTON_PAIR == 0) {
if (vflag == 1) { if (vflag == 1) {
sv0 += delx_form[jj] * fforce_0; sv0 += delx_form[jj] * fforce_0;
sv1 += dely_form[jj] * fforce_1; sv1 += dely_form[jj] * fforce_1;
sv2 += delz_form[jj] * fforce_2; sv2 += delz_form[jj] * fforce_2;
sv3 += delx_form[jj] * fforce_1; sv3 += delx_form[jj] * fforce_1;
sv4 += delx_form[jj] * fforce_2; sv4 += delx_form[jj] * fforce_2;
sv5 += dely_form[jj] * fforce_2; sv5 += dely_form[jj] * fforce_2;
} }
} // EVFLAG } // EVFLAG
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
} }
#endif #endif
} // for jj } // for jj
// ------------------------------------------------------------- // -------------------------------------------------------------
@ -756,29 +756,29 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
ierror = 2; ierror = 2;
int ip = i * 2; int ip = i * 2;
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
f[ip].x += fxtmp; f[ip].x += fxtmp;
f[ip].y += fytmp; f[ip].y += fytmp;
f[ip].z += fztmp; f[ip].z += fztmp;
ip++; ip++;
f[ip].x += t1tmp; f[ip].x += t1tmp;
f[ip].y += t2tmp; f[ip].y += t2tmp;
f[ip].z += t3tmp; f[ip].z += t3tmp;
} else { } else {
f[ip].x = fxtmp; f[ip].x = fxtmp;
f[ip].y = fytmp; f[ip].y = fytmp;
f[ip].z = fztmp; f[ip].z = fztmp;
ip++; ip++;
f[ip].x = t1tmp; f[ip].x = t1tmp;
f[ip].y = t2tmp; f[ip].y = t2tmp;
f[ip].z = t3tmp; f[ip].z = t3tmp;
} }
if (EFLAG) { if (EFLAG) {
oevdwl += sevdwl; oevdwl += sevdwl;
if (eatom) f[i * 2].w += fwtmp; if (eatom) f[i * 2].w += fwtmp;
} }
if (NEWTON_PAIR == 0) { if (NEWTON_PAIR == 0) {
if (vflag == 1) { if (vflag == 1) {
ov0 += sv0; ov0 += sv0;
ov1 += sv1; ov1 += sv1;
@ -792,30 +792,30 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
int o_range; int o_range;
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
o_range = nall; o_range = nall;
if (offload == 0) o_range -= minlocal; if (offload == 0) o_range -= minlocal;
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
sizeof(FORCE_T)); sizeof(FORCE_T));
const int sto = iito * 8; const int sto = iito * 8;
const int fst4 = f_stride * 4; const int fst4 = f_stride * 4;
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp barrier #pragma omp barrier
#endif #endif
acc_t *f_scalar = &f_start[0].x; acc_t *f_scalar = &f_start[0].x;
acc_t *f_scalar2 = f_scalar + fst4; acc_t *f_scalar2 = f_scalar + fst4;
for (int t = 1; t < nthreads; t++) { for (int t = 1; t < nthreads; t++) {
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd #pragma simd
#endif #endif
for (int n = iifrom * 8; n < sto; n++) for (int n = iifrom * 8; n < sto; n++)
f_scalar[n] += f_scalar2[n]; f_scalar[n] += f_scalar2[n];
f_scalar2 += fst4; f_scalar2 += fst4;
} }
if (vflag==2) { if (vflag==2) {
const ATOM_T * _noalias const xo = x + minlocal; const ATOM_T * _noalias const xo = x + minlocal;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma novector #pragma novector
#endif #endif
for (int n = iifrom; n < iito; n++) { for (int n = iifrom; n < iito; n++) {
const int nt2 = n * 2; const int nt2 = n * 2;
@ -826,7 +826,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
ov4 += f_start[nt2].z * xo[n].x; ov4 += f_start[nt2].z * xo[n].x;
ov5 += f_start[nt2].z * xo[n].y; ov5 += f_start[nt2].z * xo[n].y;
} }
} }
} }
if (ierror) if (ierror)
@ -840,12 +840,12 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
} }
if (vflag) { if (vflag) {
if (NEWTON_PAIR == 0) { if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)-0.5; ov0 *= (acc_t)-0.5;
ov1 *= (acc_t)-0.5; ov1 *= (acc_t)-0.5;
ov2 *= (acc_t)-0.5; ov2 *= (acc_t)-0.5;
ov3 *= (acc_t)-0.5; ov3 *= (acc_t)-0.5;
ov4 *= (acc_t)-0.5; ov4 *= (acc_t)-0.5;
ov5 *= (acc_t)-0.5; ov5 *= (acc_t)-0.5;
} }
ev_global[2] = ov0; ev_global[2] = ov0;
ev_global[3] = ov1; ev_global[3] = ov1;
@ -982,7 +982,7 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int one_length, const int one_length,
const int nthreads, const int nthreads,
Memory *memory, Memory *memory,
const int cop) { const int cop) {
if (ntypes != _ntypes) { if (ntypes != _ntypes) {
if (_ntypes > 0) { if (_ntypes > 0) {
fc_packed3 *oic = ic; fc_packed3 *oic = ic;
@ -999,9 +999,9 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
int * ojlist_form = jlist_form[0]; int * ojlist_form = jlist_form[0];
if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL && if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
orsq_form != NULL && odelx_form != NULL && odely_form != NULL && orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL && odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL &&
_cop >= 0) { _cop >= 0) {
#pragma offload_transfer target(mic:_cop) \ #pragma offload_transfer target(mic:_cop) \
nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \ nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \
nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \ nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \
@ -1033,14 +1033,14 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
memory->create(jlist_form, nthreads, one_length, "jlist_form"); memory->create(jlist_form, nthreads, one_length, "jlist_form");
for (int zn = 0; zn < nthreads; zn++) for (int zn = 0; zn < nthreads; zn++)
for (int zo = 0; zo < one_length; zo++) { for (int zo = 0; zo < one_length; zo++) {
rsq_form[zn][zo] = 10.0; rsq_form[zn][zo] = 10.0;
delx_form[zn][zo] = 10.0; delx_form[zn][zo] = 10.0;
dely_form[zn][zo] = 10.0; dely_form[zn][zo] = 10.0;
delz_form[zn][zo] = 10.0; delz_form[zn][zo] = 10.0;
jtype_form[zn][zo] = 1; jtype_form[zn][zo] = 1;
jlist_form[zn][zo] = 0; jlist_form[zn][zo] = 0;
} }
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
flt_t * ospecial_lj = special_lj; flt_t * ospecial_lj = special_lj;
@ -1057,9 +1057,9 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
int tp1sq = ntypes*ntypes; int tp1sq = ntypes*ntypes;
if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL && if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
oic != NULL && orsq_form != NULL && odelx_form != NULL && oic != NULL && orsq_form != NULL && odelx_form != NULL &&
odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL && odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL &&
ojlist_form !=NULL && cop >= 0) { ojlist_form !=NULL && cop >= 0) {
#pragma offload_transfer target(mic:cop) \ #pragma offload_transfer target(mic:cop) \
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \ nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \

View File

@ -67,8 +67,8 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag, void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) { if (eflag || vflag) {
ev_setup(eflag,vflag); ev_setup(eflag,vflag);
@ -125,9 +125,9 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc, const ForceConst<flt_t> &fc,
const int astart, const int aend) const int astart, const int aend)
{ {
const int inum = aend - astart; const int inum = aend - astart;
if (inum == 0) return; if (inum == 0) return;
@ -177,8 +177,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
// Determine how much data to transfer // Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag; int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
buffers, offload, fix, separate_flag, buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride); x_size, q_size, ev_size, f_stride);
int tc; int tc;
FORCE_T * _noalias f_start; FORCE_T * _noalias f_start;
@ -227,7 +227,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
#endif #endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, q); f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0; if (EFLAG) oevdwl = oecoul = (acc_t)0;
@ -259,7 +259,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
int * _noalias const tjtype = ccachej + toffs; int * _noalias const tjtype = ccachej + toffs;
for (int i = iifrom; i < iito; i += iip) { for (int i = iifrom; i < iito; i += iip) {
// const int i = ilist[ii]; // const int i = ilist[ii];
const int itype = x[i].w; const int itype = x[i].w;
const int ptr_off = itype * ntypes; const int ptr_off = itype * ntypes;
@ -270,175 +270,175 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
const int jnum = numneigh[i]; const int jnum = numneigh[i];
acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t fxtmp,fytmp,fztmp,fwtmp;
acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
const flt_t xtmp = x[i].x; const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y; const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z; const flt_t ztmp = x[i].z;
const flt_t qtmp = q[i]; const flt_t qtmp = q[i];
fxtmp = fytmp = fztmp = (acc_t)0; fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
int ej = 0; int ej = 0;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma ivdep #pragma ivdep
#endif #endif
for (int jj = 0; jj < jnum; jj++) { for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj] & NEIGHMASK; const int j = jlist[jj] & NEIGHMASK;
const flt_t delx = xtmp - x[j].x; const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y; const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z; const flt_t delz = ztmp - x[j].z;
const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq < cut_coulsq) { if (rsq < cut_coulsq) {
trsq[ej]=rsq; trsq[ej]=rsq;
tdelx[ej]=delx; tdelx[ej]=delx;
tdely[ej]=dely; tdely[ej]=dely;
tdelz[ej]=delz; tdelz[ej]=delz;
tjtype[ej]=x[j].w; tjtype[ej]=x[j].w;
tj[ej]=jlist[jj]; tj[ej]=jlist[jj];
ej++; ej++;
} }
} }
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5) sv0, sv1, sv2, sv3, sv4, sv5)
#endif #endif
for (int jj = 0; jj < ej; jj++) { for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl, ecoul; flt_t forcecoul, forcelj, evdwl, ecoul;
forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
const int j = tj[jj] & NEIGHMASK; const int j = tj[jj] & NEIGHMASK;
const int sbindex = tj[jj] >> SBBITS & 3; const int sbindex = tj[jj] >> SBBITS & 3;
const int jtype = tjtype[jj]; const int jtype = tjtype[jj];
const flt_t rsq = trsq[jj]; const flt_t rsq = trsq[jj];
const flt_t r2inv = (flt_t)1.0 / rsq; const flt_t r2inv = (flt_t)1.0 / rsq;
#ifdef INTEL_ALLOW_TABLE #ifdef INTEL_ALLOW_TABLE
if (!ncoultablebits || rsq <= tabinnersq) { if (!ncoultablebits || rsq <= tabinnersq) {
#endif #endif
const flt_t A1 = 0.254829592; const flt_t A1 = 0.254829592;
const flt_t A2 = -0.284496736; const flt_t A2 = -0.284496736;
const flt_t A3 = 1.421413741; const flt_t A3 = 1.421413741;
const flt_t A4 = -1.453152027; const flt_t A4 = -1.453152027;
const flt_t A5 = 1.061405429; const flt_t A5 = 1.061405429;
const flt_t EWALD_F = 1.12837917; const flt_t EWALD_F = 1.12837917;
const flt_t INV_EWALD_P = 1.0 / 0.3275911; const flt_t INV_EWALD_P = 1.0 / 0.3275911;
const flt_t r = (flt_t)1.0 / sqrt(r2inv); const flt_t r = (flt_t)1.0 / sqrt(r2inv);
const flt_t grij = g_ewald * r; const flt_t grij = g_ewald * r;
const flt_t expm2 = exp(-grij * grij); const flt_t expm2 = exp(-grij * grij);
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const flt_t prefactor = qqrd2e * qtmp * q[j] / r; const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
if (EFLAG) ecoul = prefactor * erfc; if (EFLAG) ecoul = prefactor * erfc;
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
prefactor; prefactor;
forcecoul -= adjust; forcecoul -= adjust;
if (EFLAG) ecoul -= adjust; if (EFLAG) ecoul -= adjust;
#ifdef INTEL_ALLOW_TABLE #ifdef INTEL_ALLOW_TABLE
} else { } else {
float rsq_lookup = rsq; float rsq_lookup = rsq;
const int itable = (__intel_castf32_u32(rsq_lookup) & const int itable = (__intel_castf32_u32(rsq_lookup) &
ncoulmask) >> ncoulshiftbits; ncoulmask) >> ncoulshiftbits;
const flt_t fraction = (rsq_lookup - table[itable].r) * const flt_t fraction = (rsq_lookup - table[itable].r) *
table[itable].dr; table[itable].dr;
const flt_t tablet = table[itable].f + const flt_t tablet = table[itable].f +
fraction * table[itable].df; fraction * table[itable].df;
forcecoul = qtmp * q[j] * tablet; forcecoul = qtmp * q[j] * tablet;
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
fraction * detable[itable]); fraction * detable[itable]);
if (sbindex) { if (sbindex) {
const flt_t table2 = ctable[itable] + const flt_t table2 = ctable[itable] +
fraction * dctable[itable]; fraction * dctable[itable];
const flt_t prefactor = qtmp * q[j] * table2; const flt_t prefactor = qtmp * q[j] * table2;
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
prefactor; prefactor;
forcecoul -= adjust; forcecoul -= adjust;
if (EFLAG) ecoul -= adjust; if (EFLAG) ecoul -= adjust;
} }
} }
#endif #endif
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
if (rsq < cut_ljsq) { if (rsq < cut_ljsq) {
#endif #endif
flt_t r6inv = r2inv * r2inv * r2inv; flt_t r6inv = r2inv * r2inv * r2inv;
forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y); forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w); if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
if (rsq > cut_lj_innersq) { if (rsq > cut_lj_innersq) {
#endif #endif
const flt_t drsq = cut_ljsq - rsq; const flt_t drsq = cut_ljsq - rsq;
const flt_t cut2 = (rsq - cut_lj_innersq) * drsq; const flt_t cut2 = (rsq - cut_lj_innersq) * drsq;
const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) * const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) *
inv_denom_lj; inv_denom_lj;
const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj; const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj;
if (EFLAG) { if (EFLAG) {
#ifndef INTEL_VMASK #ifndef INTEL_VMASK
if (rsq > cut_lj_innersq) { if (rsq > cut_lj_innersq) {
#endif #endif
forcelj = forcelj * switch1 + evdwl * switch2; forcelj = forcelj * switch1 + evdwl * switch2;
evdwl *= switch1; evdwl *= switch1;
#ifndef INTEL_VMASK #ifndef INTEL_VMASK
} }
#endif #endif
} else { } else {
const flt_t philj = r6inv * (lji[jtype].z*r6inv - const flt_t philj = r6inv * (lji[jtype].z*r6inv -
lji[jtype].w); lji[jtype].w);
#ifndef INTEL_VMASK #ifndef INTEL_VMASK
if (rsq > cut_lj_innersq) if (rsq > cut_lj_innersq)
#endif #endif
forcelj = forcelj * switch1 + philj * switch2; forcelj = forcelj * switch1 + philj * switch2;
} }
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
} }
#endif #endif
if (sbindex) { if (sbindex) {
const flt_t factor_lj = special_lj[sbindex]; const flt_t factor_lj = special_lj[sbindex];
forcelj *= factor_lj; forcelj *= factor_lj;
if (EFLAG) evdwl *= factor_lj; if (EFLAG) evdwl *= factor_lj;
} }
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
} }
#else #else
if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
#endif #endif
const flt_t fpair = (forcecoul + forcelj) * r2inv; const flt_t fpair = (forcecoul + forcelj) * r2inv;
const flt_t fpx = fpair * tdelx[jj]; const flt_t fpx = fpair * tdelx[jj];
fxtmp += fpx; fxtmp += fpx;
if (NEWTON_PAIR) f[j].x -= fpx; if (NEWTON_PAIR) f[j].x -= fpx;
const flt_t fpy = fpair * tdely[jj]; const flt_t fpy = fpair * tdely[jj];
fytmp += fpy; fytmp += fpy;
if (NEWTON_PAIR) f[j].y -= fpy; if (NEWTON_PAIR) f[j].y -= fpy;
const flt_t fpz = fpair * tdelz[jj]; const flt_t fpz = fpair * tdelz[jj];
fztmp += fpz; fztmp += fpz;
if (NEWTON_PAIR) f[j].z -= fpz; if (NEWTON_PAIR) f[j].z -= fpz;
if (EFLAG) { if (EFLAG) {
sevdwl += evdwl; sevdwl += evdwl;
secoul += ecoul; secoul += ecoul;
if (eatom) { if (eatom) {
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
if (NEWTON_PAIR) if (NEWTON_PAIR)
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
} }
} }
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
fpx, fpy, fpz); fpx, fpy, fpz);
} // for jj } // for jj
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
f[i].x += fxtmp; f[i].x += fxtmp;
@ -449,33 +449,33 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
f[i].y = fytmp; f[i].y = fytmp;
f[i].z = fztmp; f[i].z = fztmp;
} }
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
} // for ii } // for ii
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
ov4, ov5); ov4, ov5);
} // end of omp parallel region } // end of omp parallel region
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5); ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) { if (EFLAG) {
if (NEWTON_PAIR == 0) { if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5; oevdwl *= (acc_t)0.5;
oecoul *= (acc_t)0.5; oecoul *= (acc_t)0.5;
} }
ev_global[0] = oevdwl; ev_global[0] = oevdwl;
ev_global[1] = oecoul; ev_global[1] = oecoul;
} }
if (vflag) { if (vflag) {
if (NEWTON_PAIR == 0) { if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5; ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5; ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5; ov2 *= (acc_t)0.5;
ov3 *= (acc_t)0.5; ov3 *= (acc_t)0.5;
ov4 *= (acc_t)0.5; ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5; ov5 *= (acc_t)0.5;
} }
ev_global[2] = ov0; ev_global[2] = ov0;
ev_global[3] = ov1; ev_global[3] = ov1;
@ -556,7 +556,7 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
double cut, cutneigh; double cut, cutneigh;
if (cut_lj > cut_coul) if (cut_lj > cut_coul)
error->all(FLERR, error->all(FLERR,
"Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic"); "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
for (int i = 1; i <= atom->ntypes; i++) { for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) { for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
@ -637,7 +637,7 @@ template <class flt_t>
void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int ntable, const int ntable,
Memory *memory, Memory *memory,
const int cop) { const int cop) {
if ( (ntypes != _ntypes || ntable != _ntable) ) { if ( (ntypes != _ntypes || ntable != _ntable) ) {
if (_ntypes > 0) { if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
@ -653,12 +653,12 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL && if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
otable != NULL && oetable != NULL && odetable != NULL && otable != NULL && oetable != NULL && odetable != NULL &&
octable != NULL && odctable != NULL && ospecial_coul != NULL && octable != NULL && odctable != NULL && ospecial_coul != NULL &&
cop >= 0) { cop >= 0) {
#pragma offload_transfer target(mic:cop) \ #pragma offload_transfer target(mic:cop) \
nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \ nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \
nocopy(otable: alloc_if(0) free_if(1)) \ nocopy(otable: alloc_if(0) free_if(1)) \
nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
} }
#endif #endif
@ -694,7 +694,7 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL && if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
otable !=NULL && oetable != NULL && odetable != NULL && otable !=NULL && oetable != NULL && odetable != NULL &&
octable != NULL && odctable != NULL && ospecial_coul != NULL && octable != NULL && odctable != NULL && ospecial_coul != NULL &&
cop >= 0) { cop >= 0) {
#pragma offload_transfer target(mic:cop) \ #pragma offload_transfer target(mic:cop) \
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \ nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \

View File

@ -50,8 +50,8 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void eval(const int offload, const int vflag, void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc, const int astart, const int aend); const ForceConst<flt_t> &fc, const int astart, const int aend);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
@ -75,7 +75,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
~ForceConst() { set_ntypes(0,0,NULL,_cop); } ~ForceConst() { set_ntypes(0,0,NULL,_cop); }
void set_ntypes(const int ntypes, const int ntable, Memory *memory, void set_ntypes(const int ntypes, const int ntable, Memory *memory,
const int cop); const int cop);
private: private:
int _ntypes, _ntable, _cop; int _ntypes, _ntable, _cop;

View File

@ -68,8 +68,8 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag)
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void PairLJCutCoulLongIntel::compute(int eflag, int vflag, void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) { if (eflag || vflag) {
ev_setup(eflag,vflag); ev_setup(eflag,vflag);
@ -92,7 +92,7 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
{ {
int ifrom, ito, tid; int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
packthreads, sizeof(ATOM_T)); packthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago); buffers->thr_pack(ifrom,ito,ago);
} }
fix->stop_watch(TIME_PACK); fix->stop_watch(TIME_PACK);
@ -124,9 +124,9 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc, const ForceConst<flt_t> &fc,
const int astart, const int aend) const int astart, const int aend)
{ {
const int inum = aend - astart; const int inum = aend - astart;
if (inum == 0) return; if (inum == 0) return;
@ -171,8 +171,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
// Determine how much data to transfer // Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag; int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
buffers, offload, fix, separate_flag, buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride); x_size, q_size, ev_size, f_stride);
int tc; int tc;
FORCE_T * _noalias f_start; FORCE_T * _noalias f_start;
@ -208,7 +208,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \ in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \ in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \
@ -220,7 +220,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
#endif #endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, q); f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0; if (EFLAG) oevdwl = oecoul = (acc_t)0;
@ -261,18 +261,18 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
const int jnum = numneigh[i]; const int jnum = numneigh[i];
acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t fxtmp,fytmp,fztmp,fwtmp;
acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
const flt_t xtmp = x[i].x; const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y; const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z; const flt_t ztmp = x[i].z;
const flt_t qtmp = q[i]; const flt_t qtmp = q[i];
fxtmp = fytmp = fztmp = (acc_t)0; fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
int ej = 0; int ej = 0;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma ivdep #pragma ivdep
@ -282,91 +282,91 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
const flt_t delx = xtmp - x[j].x; const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y; const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z; const flt_t delz = ztmp - x[j].z;
const int jtype = x[j].w; const int jtype = x[j].w;
const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq < c_forcei[jtype].cutsq) { if (rsq < c_forcei[jtype].cutsq) {
trsq[ej]=rsq; trsq[ej]=rsq;
tdelx[ej]=delx; tdelx[ej]=delx;
tdely[ej]=dely; tdely[ej]=dely;
tdelz[ej]=delz; tdelz[ej]=delz;
tjtype[ej]=jtype; tjtype[ej]=jtype;
tj[ej]=jlist[jj]; tj[ej]=jlist[jj];
ej++; ej++;
} }
} }
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5) sv0, sv1, sv2, sv3, sv4, sv5)
#endif #endif
for (int jj = 0; jj < ej; jj++) { for (int jj = 0; jj < ej; jj++) {
flt_t forcecoul, forcelj, evdwl, ecoul; flt_t forcecoul, forcelj, evdwl, ecoul;
forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
const int j = tj[jj] & NEIGHMASK; const int j = tj[jj] & NEIGHMASK;
const int sbindex = tj[jj] >> SBBITS & 3; const int sbindex = tj[jj] >> SBBITS & 3;
const int jtype = tjtype[jj]; const int jtype = tjtype[jj];
const flt_t rsq = trsq[jj]; const flt_t rsq = trsq[jj];
const flt_t r2inv = (flt_t)1.0 / rsq; const flt_t r2inv = (flt_t)1.0 / rsq;
#ifdef INTEL_ALLOW_TABLE #ifdef INTEL_ALLOW_TABLE
if (!ncoultablebits || rsq <= tabinnersq) { if (!ncoultablebits || rsq <= tabinnersq) {
#endif #endif
const flt_t A1 = 0.254829592; const flt_t A1 = 0.254829592;
const flt_t A2 = -0.284496736; const flt_t A2 = -0.284496736;
const flt_t A3 = 1.421413741; const flt_t A3 = 1.421413741;
const flt_t A4 = -1.453152027; const flt_t A4 = -1.453152027;
const flt_t A5 = 1.061405429; const flt_t A5 = 1.061405429;
const flt_t EWALD_F = 1.12837917; const flt_t EWALD_F = 1.12837917;
const flt_t INV_EWALD_P = 1.0 / 0.3275911; const flt_t INV_EWALD_P = 1.0 / 0.3275911;
const flt_t r = (flt_t)1.0 / sqrt(r2inv); const flt_t r = (flt_t)1.0 / sqrt(r2inv);
const flt_t grij = g_ewald * r; const flt_t grij = g_ewald * r;
const flt_t expm2 = exp(-grij * grij); const flt_t expm2 = exp(-grij * grij);
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const flt_t prefactor = qqrd2e * qtmp * q[j] / r; const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
if (EFLAG) ecoul = prefactor * erfc; if (EFLAG) ecoul = prefactor * erfc;
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
prefactor; prefactor;
forcecoul -= adjust; forcecoul -= adjust;
if (EFLAG) ecoul -= adjust; if (EFLAG) ecoul -= adjust;
#ifdef INTEL_ALLOW_TABLE #ifdef INTEL_ALLOW_TABLE
} else { } else {
float rsq_lookup = rsq; float rsq_lookup = rsq;
const int itable = (__intel_castf32_u32(rsq_lookup) & const int itable = (__intel_castf32_u32(rsq_lookup) &
ncoulmask) >> ncoulshiftbits; ncoulmask) >> ncoulshiftbits;
const flt_t fraction = (rsq_lookup - table[itable].r) * const flt_t fraction = (rsq_lookup - table[itable].r) *
table[itable].dr; table[itable].dr;
const flt_t tablet = table[itable].f + const flt_t tablet = table[itable].f +
fraction * table[itable].df; fraction * table[itable].df;
forcecoul = qtmp * q[j] * tablet; forcecoul = qtmp * q[j] * tablet;
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
fraction * detable[itable]); fraction * detable[itable]);
if (sbindex) { if (sbindex) {
const flt_t table2 = ctable[itable] + const flt_t table2 = ctable[itable] +
fraction * dctable[itable]; fraction * dctable[itable];
const flt_t prefactor = qtmp * q[j] * table2; const flt_t prefactor = qtmp * q[j] * table2;
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
prefactor; prefactor;
forcecoul -= adjust; forcecoul -= adjust;
if (EFLAG) ecoul -= adjust; if (EFLAG) ecoul -= adjust;
} }
} }
#endif #endif
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
if (rsq < c_forcei[jtype].cut_ljsq) { if (rsq < c_forcei[jtype].cut_ljsq) {
#endif #endif
flt_t r6inv = r2inv * r2inv * r2inv; flt_t r6inv = r2inv * r2inv * r2inv;
forcelj = r6inv * (c_forcei[jtype].lj1 * r6inv - forcelj = r6inv * (c_forcei[jtype].lj1 * r6inv -
c_forcei[jtype].lj2); c_forcei[jtype].lj2);
if (EFLAG) evdwl = r6inv*(c_energyi[jtype].lj3 * r6inv - if (EFLAG) evdwl = r6inv*(c_energyi[jtype].lj3 * r6inv -
c_energyi[jtype].lj4) - c_energyi[jtype].lj4) -
c_energyi[jtype].offset; c_energyi[jtype].offset;
@ -376,14 +376,14 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
forcelj *= factor_lj; forcelj *= factor_lj;
if (EFLAG) evdwl *= factor_lj; if (EFLAG) evdwl *= factor_lj;
} }
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
} }
#else #else
if (rsq > c_forcei[jtype].cut_ljsq) if (rsq > c_forcei[jtype].cut_ljsq)
{ forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
#endif #endif
const flt_t fpair = (forcecoul + forcelj) * r2inv; const flt_t fpair = (forcecoul + forcelj) * r2inv;
const flt_t fpx = fpair * tdelx[jj]; const flt_t fpx = fpair * tdelx[jj];
fxtmp += fpx; fxtmp += fpx;
if (NEWTON_PAIR) f[j].x -= fpx; if (NEWTON_PAIR) f[j].x -= fpx;
@ -394,58 +394,58 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
fztmp += fpz; fztmp += fpz;
if (NEWTON_PAIR) f[j].z -= fpz; if (NEWTON_PAIR) f[j].z -= fpz;
if (EFLAG) { if (EFLAG) {
sevdwl += evdwl; sevdwl += evdwl;
secoul += ecoul; secoul += ecoul;
if (eatom) { if (eatom) {
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
if (NEWTON_PAIR) if (NEWTON_PAIR)
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
} }
} }
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
fpx, fpy, fpz); fpx, fpy, fpz);
} // for jj } // for jj
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
f[i].x += fxtmp; f[i].x += fxtmp;
f[i].y += fytmp; f[i].y += fytmp;
f[i].z += fztmp; f[i].z += fztmp;
} else { } else {
f[i].x = fxtmp; f[i].x = fxtmp;
f[i].y = fytmp; f[i].y = fytmp;
f[i].z = fztmp; f[i].z = fztmp;
} }
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
} // for ii } // for ii
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
ov4, ov5); ov4, ov5);
} // end of omp parallel region } // end of omp parallel region
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5); ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) { if (EFLAG) {
if (NEWTON_PAIR == 0) { if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5; oevdwl *= (acc_t)0.5;
oecoul *= (acc_t)0.5; oecoul *= (acc_t)0.5;
} }
ev_global[0] = oevdwl; ev_global[0] = oevdwl;
ev_global[1] = oecoul; ev_global[1] = oecoul;
} }
if (vflag) { if (vflag) {
if (NEWTON_PAIR == 0) { if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5; ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5; ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5; ov2 *= (acc_t)0.5;
ov3 *= (acc_t)0.5; ov3 *= (acc_t)0.5;
ov4 *= (acc_t)0.5; ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5; ov5 *= (acc_t)0.5;
} }
ev_global[2] = ov0; ev_global[2] = ov0;
ev_global[3] = ov1; ev_global[3] = ov1;
ev_global[4] = ov2; ev_global[4] = ov2;
@ -547,8 +547,8 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int i = 0; i < tp1; i++) { for (int i = 0; i < tp1; i++) {
for (int j = 0; j < tp1; j++) { for (int j = 0; j < tp1; j++) {
if (cutsq[i][j] < cut_ljsq[i][j]) if (cutsq[i][j] < cut_ljsq[i][j])
error->all(FLERR, error->all(FLERR,
"Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic"); "Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic");
fc.c_force[i][j].cutsq = cutsq[i][j]; fc.c_force[i][j].cutsq = cutsq[i][j];
fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j]; fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
fc.c_force[i][j].lj1 = lj1[i][j]; fc.c_force[i][j].lj1 = lj1[i][j];
@ -598,9 +598,9 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int ntable, const int ntable,
Memory *memory, Memory *memory,
const int cop) { const int cop) {
if ( (ntypes != _ntypes || ntable != _ntable) ) { if ( (ntypes != _ntypes || ntable != _ntable) ) {
if (_ntypes > 0) { if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
@ -619,9 +619,9 @@ void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
ospecial_coul != NULL && _cop >= 0) { ospecial_coul != NULL && _cop >= 0) {
#pragma offload_transfer target(mic:cop) \ #pragma offload_transfer target(mic:cop) \
nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
nocopy(otable: alloc_if(0) free_if(1)) \ nocopy(otable: alloc_if(0) free_if(1)) \
nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
} }
#endif #endif

View File

@ -50,8 +50,8 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void eval(const int offload, const int vflag, void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc, const int astart, const int aend); const ForceConst<flt_t> &fc, const int astart, const int aend);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,
@ -76,7 +76,7 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
~ForceConst() { set_ntypes(0,0,NULL,_cop); } ~ForceConst() { set_ntypes(0,0,NULL,_cop); }
void set_ntypes(const int ntypes, const int ntable, Memory *memory, void set_ntypes(const int ntypes, const int ntable, Memory *memory,
const int cop); const int cop);
private: private:
int _ntypes, _ntable, _cop; int _ntypes, _ntable, _cop;

View File

@ -96,37 +96,37 @@ void PairLJCutIntel::compute(int eflag, int vflag,
if (_onetype) { if (_onetype) {
if (eflag) { if (eflag) {
if (force->newton_pair) { if (force->newton_pair) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else { } else {
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
} }
} else { } else {
if (force->newton_pair) { if (force->newton_pair) {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else { } else {
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
} }
} }
} else { } else {
if (eflag) { if (eflag) {
if (force->newton_pair) { if (force->newton_pair) {
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else { } else {
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
} }
} else { } else {
if (force->newton_pair) { if (force->newton_pair) {
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else { } else {
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
} }
} }
} }
@ -161,8 +161,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
// Determine how much data to transfer // Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag; int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
buffers, offload, fix, separate_flag, buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride); x_size, q_size, ev_size, f_stride);
int tc; int tc;
FORCE_T * _noalias f_start; FORCE_T * _noalias f_start;
@ -176,7 +176,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
#endif #endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, 0); f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0; if (EFLAG) oevdwl = (acc_t)0;
@ -200,23 +200,23 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
flt_t cutsq, lj1, lj2, lj3, lj4, offset; flt_t cutsq, lj1, lj2, lj3, lj4, offset;
if (ONETYPE) { if (ONETYPE) {
cutsq = ljc12o[3].cutsq; cutsq = ljc12o[3].cutsq;
lj1 = ljc12o[3].lj1; lj1 = ljc12o[3].lj1;
lj2 = ljc12o[3].lj2; lj2 = ljc12o[3].lj2;
lj3 = lj34[3].lj3; lj3 = lj34[3].lj3;
lj4 = lj34[3].lj4; lj4 = lj34[3].lj4;
offset = ljc12o[3].offset; offset = ljc12o[3].offset;
} }
for (int i = iifrom; i < iito; i += iip) { for (int i = iifrom; i < iito; i += iip) {
int itype, ptr_off; int itype, ptr_off;
const FC_PACKED1_T * _noalias ljc12oi; const FC_PACKED1_T * _noalias ljc12oi;
const FC_PACKED2_T * _noalias lj34i; const FC_PACKED2_T * _noalias lj34i;
if (!ONETYPE) { if (!ONETYPE) {
itype = x[i].w; itype = x[i].w;
ptr_off = itype * ntypes; ptr_off = itype * ntypes;
ljc12oi = ljc12o + ptr_off; ljc12oi = ljc12o + ptr_off;
lj34i = lj34 + ptr_off; lj34i = lj34 + ptr_off;
} }
const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i]; const int jnum = numneigh[i];
@ -228,113 +228,113 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
const flt_t ytmp = x[i].y; const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z; const flt_t ztmp = x[i].z;
fxtmp = fytmp = fztmp = (acc_t)0; fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = (acc_t)0; if (EFLAG) fwtmp = sevdwl = (acc_t)0;
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma vector aligned #pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5) sv0, sv1, sv2, sv3, sv4, sv5)
#endif #endif
for (int jj = 0; jj < jnum; jj++) { for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl; flt_t forcelj, evdwl;
forcelj = evdwl = (flt_t)0.0; forcelj = evdwl = (flt_t)0.0;
int j, jtype, sbindex; int j, jtype, sbindex;
if (!ONETYPE) { if (!ONETYPE) {
sbindex = jlist[jj] >> SBBITS & 3; sbindex = jlist[jj] >> SBBITS & 3;
j = jlist[jj] & NEIGHMASK; j = jlist[jj] & NEIGHMASK;
} else } else
j = jlist[jj]; j = jlist[jj];
const flt_t delx = xtmp - x[j].x; const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y; const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z; const flt_t delz = ztmp - x[j].z;
if (!ONETYPE) { if (!ONETYPE) {
jtype = x[j].w; jtype = x[j].w;
cutsq = ljc12oi[jtype].cutsq; cutsq = ljc12oi[jtype].cutsq;
} }
const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t rsq = delx * delx + dely * dely + delz * delz;
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
if (rsq < cutsq) { if (rsq < cutsq) {
#endif #endif
flt_t factor_lj; flt_t factor_lj;
if (!ONETYPE) factor_lj = special_lj[sbindex]; if (!ONETYPE) factor_lj = special_lj[sbindex];
flt_t r2inv = 1.0 / rsq; flt_t r2inv = 1.0 / rsq;
flt_t r6inv = r2inv * r2inv * r2inv; flt_t r6inv = r2inv * r2inv * r2inv;
#ifndef INTEL_VMASK #ifndef INTEL_VMASK
if (rsq > cutsq) r6inv = (flt_t)0.0; if (rsq > cutsq) r6inv = (flt_t)0.0;
#endif #endif
if (!ONETYPE) { if (!ONETYPE) {
lj1 = ljc12oi[jtype].lj1; lj1 = ljc12oi[jtype].lj1;
lj2 = ljc12oi[jtype].lj2; lj2 = ljc12oi[jtype].lj2;
} }
forcelj = r6inv * (lj1 * r6inv - lj2); forcelj = r6inv * (lj1 * r6inv - lj2);
flt_t fpair; flt_t fpair;
if (!ONETYPE) if (!ONETYPE)
fpair = factor_lj * forcelj * r2inv; fpair = factor_lj * forcelj * r2inv;
else else
fpair = forcelj * r2inv; fpair = forcelj * r2inv;
const flt_t fpx = fpair * delx; const flt_t fpx = fpair * delx;
fxtmp += fpx; fxtmp += fpx;
if (NEWTON_PAIR) f[j].x -= fpx; if (NEWTON_PAIR) f[j].x -= fpx;
const flt_t fpy = fpair * dely; const flt_t fpy = fpair * dely;
fytmp += fpy; fytmp += fpy;
if (NEWTON_PAIR) f[j].y -= fpy; if (NEWTON_PAIR) f[j].y -= fpy;
const flt_t fpz = fpair * delz; const flt_t fpz = fpair * delz;
fztmp += fpz; fztmp += fpz;
if (NEWTON_PAIR) f[j].z -= fpz; if (NEWTON_PAIR) f[j].z -= fpz;
if (EFLAG) { if (EFLAG) {
if (!ONETYPE) { if (!ONETYPE) {
lj3 = lj34i[jtype].lj3; lj3 = lj34i[jtype].lj3;
lj4 = lj34i[jtype].lj4; lj4 = lj34i[jtype].lj4;
offset = ljc12oi[jtype].offset; offset = ljc12oi[jtype].offset;
} }
evdwl = r6inv * (lj3 * r6inv - lj4); evdwl = r6inv * (lj3 * r6inv - lj4);
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
evdwl -= offset; evdwl -= offset;
#else #else
if (rsq < cutsq) evdwl -= offset; if (rsq < cutsq) evdwl -= offset;
#endif #endif
if (!ONETYPE) evdwl *= factor_lj; if (!ONETYPE) evdwl *= factor_lj;
sevdwl += evdwl; sevdwl += evdwl;
if (eatom) { if (eatom) {
fwtmp += (flt_t)0.5 * evdwl; fwtmp += (flt_t)0.5 * evdwl;
if (NEWTON_PAIR) if (NEWTON_PAIR)
f[j].w += (flt_t)0.5 * evdwl; f[j].w += (flt_t)0.5 * evdwl;
} }
} }
if (NEWTON_PAIR == 0) if (NEWTON_PAIR == 0)
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
#ifdef INTEL_VMASK #ifdef INTEL_VMASK
} // if rsq } // if rsq
#endif #endif
} // for jj } // for jj
if (NEWTON_PAIR) { if (NEWTON_PAIR) {
f[i].x += fxtmp; f[i].x += fxtmp;
f[i].y += fytmp; f[i].y += fytmp;
f[i].z += fztmp; f[i].z += fztmp;
} else { } else {
f[i].x = fxtmp; f[i].x = fxtmp;
f[i].y = fytmp; f[i].y = fytmp;
f[i].z = fztmp; f[i].z = fztmp;
} }
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
} // for ii } // for ii
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
ov4, ov5); ov4, ov5);
} // end omp } // end omp
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5); ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) { if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
@ -343,12 +343,12 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
} }
if (vflag) { if (vflag) {
if (NEWTON_PAIR == 0) { if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5; ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5; ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5; ov2 *= (acc_t)0.5;
ov3 *= (acc_t)0.5; ov3 *= (acc_t)0.5;
ov4 *= (acc_t)0.5; ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5; ov5 *= (acc_t)0.5;
} }
ev_global[2] = ov0; ev_global[2] = ov0;
ev_global[3] = ov1; ev_global[3] = ov1;
@ -454,7 +454,7 @@ void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
template <class flt_t> template <class flt_t>
void PairLJCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, void PairLJCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
Memory *memory, Memory *memory,
const int cop) { const int cop) {
if (ntypes != _ntypes) { if (ntypes != _ntypes) {
if (_ntypes > 0) { if (_ntypes > 0) {
fc_packed1 *oljc12o = ljc12o[0]; fc_packed1 *oljc12o = ljc12o[0];

View File

@ -1,50 +1,50 @@
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the GNU General Public License. the GNU General Public License.
See the README file in the top-level LAMMPS directory. See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
Contributing authors: William McDoniel (RWTH Aachen University) Contributing authors: William McDoniel (RWTH Aachen University)
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
#include <math.h> #include <math.h>
#include "pair_lj_long_coul_long_intel.h" #include "pair_lj_long_coul_long_intel.h"
#include "atom.h" #include "atom.h"
#include "comm.h" #include "comm.h"
#include "force.h" #include "force.h"
#include "group.h" #include "group.h"
#include "kspace.h" #include "kspace.h"
#include "memory.h" #include "memory.h"
#include "neighbor.h" #include "neighbor.h"
#include "neigh_list.h" #include "neigh_list.h"
#include "neigh_request.h" #include "neigh_request.h"
#include "memory.h" #include "memory.h"
#include "suffix.h" #include "suffix.h"
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
#define C_FORCE_T typename ForceConst<flt_t>::c_force_t #define C_FORCE_T typename ForceConst<flt_t>::c_force_t
#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t #define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
#define TABLE_T typename ForceConst<flt_t>::table_t #define TABLE_T typename ForceConst<flt_t>::table_t
/* ----------------------------------------------------------------------
   Constructor: forwards lmp to the PairLJLongCoulLong base class, then
   marks this style with the INTEL suffix bit and resets the two
   rRESPA-related members.
------------------------------------------------------------------------- */
PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) : PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) :
PairLJLongCoulLong(lmp) PairLJLongCoulLong(lmp)
{ {
// OR the INTEL bit into suffix_flag; bits already set are preserved
suffix_flag |= Suffix::INTEL; suffix_flag |= Suffix::INTEL;
// NOTE(review): presumably turns off rRESPA support for this variant -
// confirm against how the base Pair class consumes respa_enable
respa_enable = 0; respa_enable = 0;
cut_respa = NULL; cut_respa = NULL;
} }
/* ----------------------------------------------------------------------
   Destructor: intentionally empty, this class releases nothing itself.
------------------------------------------------------------------------- */
PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel() PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel()
{ {
} }

View File

@ -1,39 +1,39 @@
/* *- c++ -*- ----------------------------------------------------------- /* *- c++ -*- -----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the GNU General Public License. the GNU General Public License.
See the README file in the top-level LAMMPS directory. See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
Contributing authors: William McDoniel (RWTH Aachen University) Contributing authors: William McDoniel (RWTH Aachen University)
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
#ifdef PAIR_CLASS #ifdef PAIR_CLASS
PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel) PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel)
#else #else
#ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H #ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
#define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H #define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
#include "pair_lj_long_coul_long.h" #include "pair_lj_long_coul_long.h"
#include "fix_intel.h" #include "fix_intel.h"
namespace LAMMPS_NS { namespace LAMMPS_NS {
// Pair style class registered in this header as lj/long/coul/long/intel.
// It derives from PairLJLongCoulLong and declares only a constructor and
// a virtual destructor; no other members are declared in this class.
class PairLJLongCoulLongIntel : public PairLJLongCoulLong { class PairLJLongCoulLongIntel : public PairLJLongCoulLong {
public: public:
// Takes a pointer to the LAMMPS instance.
PairLJLongCoulLongIntel(class LAMMPS *); PairLJLongCoulLongIntel(class LAMMPS *);
virtual ~PairLJLongCoulLongIntel(); virtual ~PairLJLongCoulLongIntel();
}; };
} }
#endif #endif
#endif #endif

File diff suppressed because it is too large Load Diff

View File

@ -49,7 +49,7 @@ class PairSWIntel : public PairSW {
template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t> template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t>
void eval(const int offload, const int vflag, void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
const int astart, const int aend, const int pad_width); const int astart, const int aend, const int pad_width);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,

View File

@ -47,7 +47,7 @@ void PairTersoffIntel::init_style()
{ {
if (comm->me == 0) { if (comm->me == 0) {
error->warning(FLERR, "Tersoff/intel currently requires intel compiler. " error->warning(FLERR, "Tersoff/intel currently requires intel compiler. "
"Using MANYBODY version."); "Using MANYBODY version.");
} }
PairTersoff::init_style(); PairTersoff::init_style();
} }
@ -87,7 +87,7 @@ PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
void PairTersoffIntel::compute(int eflag, int vflag) void PairTersoffIntel::compute(int eflag, int vflag)
{ {
if (fix->precision()==FixIntel::PREC_MODE_MIXED) { if (fix->precision()==FixIntel::PREC_MODE_MIXED) {
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single); force_const_single);
} else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) { } else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) {
compute<double,double>(eflag, vflag, fix->get_double_buffers(), compute<double,double>(eflag, vflag, fix->get_double_buffers(),
@ -104,8 +104,8 @@ void PairTersoffIntel::compute(int eflag, int vflag)
// do we need to calculate energy/virial // do we need to calculate energy/virial
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void PairTersoffIntel::compute(int eflag, int vflag, void PairTersoffIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc) const ForceConst<flt_t> &fc)
{ {
if (eflag || vflag) { if (eflag || vflag) {
ev_setup(eflag,vflag); ev_setup(eflag,vflag);
@ -127,13 +127,13 @@ void PairTersoffIntel::compute(int eflag, int vflag,
#endif #endif
{ {
int ifrom, ito, tid; int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
packthreads, sizeof(ATOM_T)); packthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago); buffers->thr_pack(ifrom,ito,ago);
} }
fix->stop_watch(TIME_PACK); fix->stop_watch(TIME_PACK);
} }
int ovflag = 0; int ovflag = 0;
if (vflag_fdotr) ovflag = 2; if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1; else if (vflag) ovflag = 1;
@ -170,14 +170,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
// what's done in here is that they are inlined and vectorized // what's done in here is that they are inlined and vectorized
// attractive() also provides an option to compute zeta as well // attractive() also provides an option to compute zeta as well
static fvec zeta_vector( static fvec zeta_vector(
const c_inner_t * param, const c_inner_t * param,
ivec xjw, bvec mask, ivec xjw, bvec mask,
fvec vrij, fvec rsq2, fvec vrij, fvec rsq2,
fvec vdijx, fvec vdijy, fvec vdijz, fvec vdijx, fvec vdijy, fvec vdijz,
fvec dikx, fvec diky, fvec dikz fvec dikx, fvec diky, fvec dikz
); );
static void force_zeta_vector( static void force_zeta_vector(
const c_outer_t * param, const c_outer_t * param,
ivec xjw, ivec xjw,
bvec mask, bvec mask,
fvec vrijsq, fvec vzeta_ij, fvec vrijsq, fvec vzeta_ij,
@ -202,14 +202,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
// perform the actual computation // perform the actual computation
template<bool EFLAG> template<bool EFLAG>
static void kernel( static void kernel(
int iito, int iifrom, int eatom, int vflag, int iito, int iifrom, int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const numneigh,
const int * _noalias const numneighhalf, const int * _noalias const numneighhalf,
const int * _noalias const cnumneigh, const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes, const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const c_inner_t * _noalias const c_inner, const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer, const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
acc_t *evdwl acc_t *evdwl
); );
@ -217,14 +217,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
// perform one step of calculation, pass in i-j pairs of atoms (is, js) // perform one step of calculation, pass in i-j pairs of atoms (is, js)
template<int EFLAG> template<int EFLAG>
static void kernel_step( static void kernel_step(
int eatom, int vflag, int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const numneigh,
const int * _noalias const cnumneigh, const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, const int * _noalias const firstneigh,
int ntypes, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const c_inner_t * _noalias const c_inner, const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer, const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl, int compress_idx, iarr is, iarr js, bvec vmask_repulsive avec *vsevdwl, int compress_idx, iarr is, iarr js, bvec vmask_repulsive
); );
@ -233,12 +233,12 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
// with fixed i and a number of js // with fixed i and a number of js
template<int EFLAG> template<int EFLAG>
static void kernel_step_const_i( static void kernel_step_const_i(
int eatom, int vflag, int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const cnumneigh, const int * _noalias const numneigh, const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes, const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const c_inner_t * _noalias const c_inner, const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer, const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive
); );
@ -255,9 +255,9 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
// This method is nearly identical to what happens in the other /intel styles // This method is nearly identical to what happens in the other /intel styles
template <int EFLAG, class flt_t, class acc_t> template <int EFLAG, class flt_t, class acc_t>
void PairTersoffIntel::eval(const int offload, const int vflag, void PairTersoffIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc, const ForceConst<flt_t> &fc,
const int astart, const int aend) const int astart, const int aend)
{ {
const int inum = aend - astart; const int inum = aend - astart;
if (inum == 0) return; if (inum == 0) return;
@ -289,8 +289,8 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
// Determine how much data to transfer // Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag; int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, 1, EFLAG, vflag, IP_PRE_get_transfern(ago, 1, EFLAG, vflag,
buffers, offload, fix, separate_flag, buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride); x_size, q_size, ev_size, f_stride);
int tc; int tc;
FORCE_T * _noalias f_start; FORCE_T * _noalias f_start;
@ -326,8 +326,8 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
#endif #endif
#endif #endif
IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall, IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
f_stride, x, 0); f_stride, x, 0);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0; if (EFLAG) oevdwl = oecoul = (acc_t)0;
@ -354,7 +354,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
// Pick the variable i algorithm under specific conditions // Pick the variable i algorithm under specific conditions
// do use scalar algorithm with very short vectors // do use scalar algorithm with very short vectors
int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL; int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL;
bool pack_i = VL >= 8 && bool pack_i = VL >= 8 &&
lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops; lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
bool use_scalar = VL < 4; bool use_scalar = VL < 4;
if (use_scalar) { if (use_scalar) {
@ -364,16 +364,16 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
} else { } else {
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EFLAG>(ARGS); IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EFLAG>(ARGS);
} }
if (EFLAG) oevdwl += sevdwl; if (EFLAG) oevdwl += sevdwl;
} }
IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start,
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
ov4, ov5); ov4, ov5);
} // end of omp parallel region } // end of omp parallel region
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5); ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) { if (EFLAG) {
ev_global[0] = oevdwl; ev_global[0] = oevdwl;
@ -431,7 +431,7 @@ void PairTersoffIntel::init_style()
error->all(FLERR, error->all(FLERR,
"The 'package intel' command is required for /intel styles"); "The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]); fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check(); fix->pair_init_check();
fix->three_body_neighbor(1); fix->three_body_neighbor(1);
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
@ -481,25 +481,25 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int k = 1; k < tp1; k++) { for (int k = 1; k < tp1; k++) {
Param * param = &params[elem2param[map[i]][map[j]][map[k]]]; Param * param = &params[elem2param[map[i]][map[j]][map[k]]];
fc.c_cutoff_inner[i][k][j].cutsq = static_cast<flt_t>(param->cutsq); fc.c_cutoff_inner[i][k][j].cutsq = static_cast<flt_t>(param->cutsq);
fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3); fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
fc.c_inner_loop[i][j][k].bigr = static_cast<flt_t>(param->bigr); fc.c_inner_loop[i][j][k].bigr = static_cast<flt_t>(param->bigr);
fc.c_inner_loop[i][j][k].bigd = static_cast<flt_t>(param->bigd); fc.c_inner_loop[i][j][k].bigd = static_cast<flt_t>(param->bigd);
fc.c_inner_loop[i][j][k].c2 = static_cast<flt_t>(param->c * param->c); fc.c_inner_loop[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
fc.c_inner_loop[i][j][k].d2 = static_cast<flt_t>(param->d * param->d); fc.c_inner_loop[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
fc.c_inner_loop[i][j][k].h = static_cast<flt_t>(param->h); fc.c_inner_loop[i][j][k].h = static_cast<flt_t>(param->h);
fc.c_inner_loop[i][j][k].gamma = static_cast<flt_t>(param->gamma); fc.c_inner_loop[i][j][k].gamma = static_cast<flt_t>(param->gamma);
fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint); fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint);
fc.c_inner[i][j][k].cutsq = static_cast<flt_t>(param->cutsq); fc.c_inner[i][j][k].cutsq = static_cast<flt_t>(param->cutsq);
fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3); fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
fc.c_inner[i][j][k].bigr = static_cast<flt_t>(param->bigr); fc.c_inner[i][j][k].bigr = static_cast<flt_t>(param->bigr);
fc.c_inner[i][j][k].bigd = static_cast<flt_t>(param->bigd); fc.c_inner[i][j][k].bigd = static_cast<flt_t>(param->bigd);
fc.c_inner[i][j][k].c2 = static_cast<flt_t>(param->c * param->c); fc.c_inner[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
fc.c_inner[i][j][k].d2 = static_cast<flt_t>(param->d * param->d); fc.c_inner[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
fc.c_inner[i][j][k].h = static_cast<flt_t>(param->h); fc.c_inner[i][j][k].h = static_cast<flt_t>(param->h);
fc.c_inner[i][j][k].gamma = static_cast<flt_t>(param->gamma); fc.c_inner[i][j][k].gamma = static_cast<flt_t>(param->gamma);
fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint); fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint);
} }
Param * param = &params[elem2param[map[i]][map[j]][map[j]]]; Param * param = &params[elem2param[map[i]][map[j]][map[j]]];
fc.c_cutoff_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq); fc.c_cutoff_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
@ -515,7 +515,7 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
fc.c_second_loop[i][j].c2 = static_cast<flt_t>(param->c2); fc.c_second_loop[i][j].c2 = static_cast<flt_t>(param->c2);
fc.c_second_loop[i][j].c3 = static_cast<flt_t>(param->c3); fc.c_second_loop[i][j].c3 = static_cast<flt_t>(param->c3);
fc.c_second_loop[i][j].c4 = static_cast<flt_t>(param->c4); fc.c_second_loop[i][j].c4 = static_cast<flt_t>(param->c4);
fc.c_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq); fc.c_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
fc.c_outer[i][j].bigr = static_cast<flt_t>(param->bigr); fc.c_outer[i][j].bigr = static_cast<flt_t>(param->bigr);
fc.c_outer[i][j].bigd = static_cast<flt_t>(param->bigd); fc.c_outer[i][j].bigd = static_cast<flt_t>(param->bigd);
@ -563,8 +563,8 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
// As in any other /intel pair style // As in any other /intel pair style
template <class flt_t> template <class flt_t>
void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
Memory *memory, Memory *memory,
const int cop) { const int cop) {
if ( (ntypes != _ntypes) ) { if ( (ntypes != _ntypes) ) {
if (_ntypes > 0) { if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
@ -575,12 +575,12 @@ void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0]; c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0];
c_inner_t * oc_inner = c_inner[0][0]; c_inner_t * oc_inner = c_inner[0][0];
c_outer_t * oc_outer = c_outer[0]; c_outer_t * oc_outer = c_outer[0];
if (c_first_loop != NULL && c_second_loop != NULL && if (c_first_loop != NULL && c_second_loop != NULL &&
c_inner_loop != NULL && _cop >= 0) { c_inner_loop != NULL && _cop >= 0) {
#pragma offload_transfer target(mic:cop) \ #pragma offload_transfer target(mic:cop) \
nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \ nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \
nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \ nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \
nocopy(oc_inner, oc_outer: alloc_if(0) free_if(0)) nocopy(oc_inner, oc_outer: alloc_if(0) free_if(0))
} }
#endif #endif
@ -614,7 +614,7 @@ void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
int tp1sq = ntypes * ntypes; int tp1sq = ntypes * ntypes;
int tp1cb = ntypes * ntypes * ntypes; int tp1cb = ntypes * ntypes * ntypes;
int tp1cb_pad = ntypes * ntypes * ntypes_pad; int tp1cb_pad = ntypes * ntypes * ntypes_pad;
if (oc_first_loop != NULL && oc_second_loop != NULL && if (oc_first_loop != NULL && oc_second_loop != NULL &&
oc_inner_loop != NULL && cop >= 0) { oc_inner_loop != NULL && cop >= 0) {
#pragma offload_transfer target(mic:cop) \ #pragma offload_transfer target(mic:cop) \
nocopy(oc_first_loop: length(tp1sq) alloc_if(1) free_if(0)) \ nocopy(oc_first_loop: length(tp1sq) alloc_if(1) free_if(0)) \
@ -642,15 +642,15 @@ static const int N_CACHE = 8;
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<int EFLAG> template<int EFLAG>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
int eatom, int vflag, int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const cnumneigh, const int * _noalias const numneigh, const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes, const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner, const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl, avec *vsevdwl,
int compress_idx, int compress_idx,
iarr is, iarr is,
iarr js, iarr js,
bvec vmask_repulsive bvec vmask_repulsive
@ -662,7 +662,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
ivec v_i0(0); ivec v_i0(0);
ivec v_i_ntypes(ntypes); ivec v_i_ntypes(ntypes);
ivec v_i_NEIGHMASK(NEIGHMASK); ivec v_i_NEIGHMASK(NEIGHMASK);
farr fx, fy, fz, fw; farr fx, fy, fz, fw;
int cache_idx = 0; int cache_idx = 0;
fvec vfkx_cache[N_CACHE]; fvec vfkx_cache[N_CACHE];
@ -672,7 +672,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
bvec vmask_cache[N_CACHE]; bvec vmask_cache[N_CACHE];
ivec vkks_final_cache; ivec vkks_final_cache;
bvec vmask_final_cache; bvec vmask_final_cache;
iarr ts; iarr ts;
// compute all the stuff we know from i and j // compute all the stuff we know from i and j
// TODO: We could extract this from the driver routine // TODO: We could extract this from the driver routine
ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is)); ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is));
@ -738,7 +738,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
&vfix,&vfiy,&vfiz, &vfix,&vfiy,&vfiz,
&vfjx,&vfjy,&vfjz, &vfjx,&vfjy,&vfjz,
&vfkx,&vfky,&vfkz, &vfkx,&vfky,&vfkz,
&vzeta_contrib); &vzeta_contrib);
vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix); vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy); vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz); vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
@ -749,9 +749,9 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
vfkx_cache[cache_idx] = vfkx; vfkx_cache[cache_idx] = vfkx;
vfky_cache[cache_idx] = vfky; vfky_cache[cache_idx] = vfky;
vfkz_cache[cache_idx] = vfkz; vfkz_cache[cache_idx] = vfkz;
vks_cache[cache_idx] = vks; vks_cache[cache_idx] = vks;
vmask_cache[cache_idx] = veff_mask; vmask_cache[cache_idx] = veff_mask;
cache_idx += 1; cache_idx += 1;
vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib); vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
vkks = vkks + v_i1; vkks = vkks + v_i1;
@ -799,7 +799,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
vfjxtmp = vfjxtmp * vprefactor - vdx_ij * vfpair; vfjxtmp = vfjxtmp * vprefactor - vdx_ij * vfpair;
vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair; vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair;
vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair; vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair;
if (EFLAG) { if (EFLAG) {
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl); *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
if (eatom) { if (eatom) {
@ -833,7 +833,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
fvec vx_k, vy_k, vz_k, vcutsq; fvec vx_k, vy_k, vz_k, vcutsq;
while (! v::mask_testz(vactive_mask)) { while (! v::mask_testz(vactive_mask)) {
bvec vnew_mask = vactive_mask & ~ veff_old_mask; bvec vnew_mask = vactive_mask & ~ veff_old_mask;
vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK & vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh)); v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k); v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
fvec vdx_ik = vx_k - vx_i; fvec vdx_ik = vx_k - vx_i;
@ -855,7 +855,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
&vfix,&vfiy,&vfiz, &vfix,&vfiy,&vfiz,
&vfjx,&vfjy,&vfjz, &vfjx,&vfjy,&vfjz,
&vfkx,&vfky,&vfkz, &vfkx,&vfky,&vfkz,
0); 0);
vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix); vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy); vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz); vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
@ -917,15 +917,15 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<int EFLAG> template<int EFLAG>
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
int eatom, int vflag, int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const cnumneigh, const int * _noalias const numneigh, const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes, const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner, const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl, avec *vsevdwl,
int compress_idx, int compress_idx,
int i, int i,
iarr js, iarr js,
bvec vmask_repulsive bvec vmask_repulsive
@ -951,7 +951,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
int kk_final_cache; int kk_final_cache;
aarr fx, fy, fz, fw; aarr fx, fy, fz, fw;
iarr ts; iarr ts;
bvec vmask = v::mask_enable_lower(compress_idx); bvec vmask = v::mask_enable_lower(compress_idx);
fvec vx_i(x[i].x), vy_i(x[i].y), vz_i(x[i].z); fvec vx_i(x[i].x), vy_i(x[i].y), vz_i(x[i].z);
@ -997,7 +997,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
fvec vfix, vfiy, vfiz; fvec vfix, vfiy, vfiz;
fvec vfjx, vfjy, vfjz; fvec vfjx, vfjy, vfjz;
fvec vfkx, vfky, vfkz; fvec vfkx, vfky, vfkz;
attractive_vector<true>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,fvec(1.), attractive_vector<true>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,fvec(1.),
vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik, vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
&vfix,&vfiy,&vfiz, &vfix,&vfiy,&vfiz,
@ -1010,7 +1010,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx); vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy); vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz); vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
vfkx_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkx, v::zero()); vfkx_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkx, v::zero());
vfky_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfky, v::zero()); vfky_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfky, v::zero());
vfkz_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkz, v::zero()); vfkz_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkz, v::zero());
@ -1037,7 +1037,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k))); bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
bvec veff_mask = vcutoff_mask & vsame_mask & vmask; bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
if (! v::mask_testz(veff_mask)) { if (! v::mask_testz(veff_mask)) {
fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq, fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq,
vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik); vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik);
vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib); vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
} }
@ -1051,7 +1051,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
vfjxtmp = vfjxtmp * vaprefactor - avec(vdx_ij * vfpair); vfjxtmp = vfjxtmp * vaprefactor - avec(vdx_ij * vfpair);
vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair); vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair);
vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair); vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair);
if (EFLAG) { if (EFLAG) {
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl); *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
if (eatom) { if (eatom) {
@ -1093,7 +1093,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
&vfix,&vfiy,&vfiz, &vfix,&vfiy,&vfiz,
&vfjx,&vfjy,&vfjz, &vfjx,&vfjy,&vfjz,
&vfkx,&vfky,&vfkz, &vfkx,&vfky,&vfkz,
0); 0);
vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix); vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy); vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz); vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
@ -1129,14 +1129,14 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<bool EFLAG> template<bool EFLAG>
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
int iito, int iifrom, int eatom, int vflag, int iito, int iifrom, int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const numneigh,
const int * _noalias const numneighhalf, const int * _noalias const numneighhalf,
const int * _noalias const cnumneigh, const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes, const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const c_inner_t * _noalias const c_inner, const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer, const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
acc_t *evdwl acc_t *evdwl
) { ) {
@ -1181,10 +1181,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
if (compress_idx == v::VL) { if (compress_idx == v::VL) {
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
kernel_step<EFLAG>( kernel_step<EFLAG>(
eatom, vflag, eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes, numneigh, cnumneigh, firstneigh, ntypes,
x, c_inner, c_outer, f, x, c_inner, c_outer, f,
&vsevdwl, compress_idx, &vsevdwl, compress_idx,
is, js, vmask_repulsive is, js, vmask_repulsive
); );
compress_idx = 0; compress_idx = 0;
@ -1194,10 +1194,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) { if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) {
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
kernel_step_const_i<EFLAG>( kernel_step_const_i<EFLAG>(
eatom, vflag, eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes, numneigh, cnumneigh, firstneigh, ntypes,
x, c_inner, c_outer, f, x, c_inner, c_outer, f,
&vsevdwl, compress_idx, &vsevdwl, compress_idx,
i, js, vmask_repulsive i, js, vmask_repulsive
); );
compress_idx = 0; compress_idx = 0;
@ -1209,10 +1209,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
if (compress_idx > 0) { if (compress_idx > 0) {
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
IntelKernelTersoff::kernel_step<EFLAG>( IntelKernelTersoff::kernel_step<EFLAG>(
eatom, vflag, eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes, numneigh, cnumneigh, firstneigh, ntypes,
x, c_inner, c_outer, f, x, c_inner, c_outer, f,
&vsevdwl, compress_idx, &vsevdwl, compress_idx,
is, js, vmask_repulsive is, js, vmask_repulsive
); );
} }
@ -1224,10 +1224,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector( IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector(
const c_inner_t * param, const c_inner_t * param,
ivec xjw, bvec mask, ivec xjw, bvec mask,
fvec vrij, fvec rsq2, fvec vrij, fvec rsq2,
fvec vdijx, fvec vdijy, fvec vdijz, fvec vdijx, fvec vdijy, fvec vdijz,
fvec dikx, fvec diky, fvec dikz fvec dikx, fvec diky, fvec dikz
) { ) {
fvec v_1_0(1.0); fvec v_1_0(1.0);
@ -1250,7 +1250,7 @@ IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t
// It's kind of important to check the mask. // It's kind of important to check the mask.
// Some simulations never/rarely invoke this branch. // Some simulations never/rarely invoke this branch.
if (! v::mask_testz(vmask_need_sine)) { if (! v::mask_testz(vmask_need_sine)) {
vfc = v::blend(vmask_need_sine, vfc, vfc = v::blend(vmask_need_sine, vfc,
v_0_5 * (v_1_0 - sin(fvec(MY_PI2) * (vrik - vpbigr) * v::recip(vpbigd)))); v_0_5 * (v_1_0 - sin(fvec(MY_PI2) * (vrik - vpbigr) * v::recip(vpbigd))));
} }
return vgijk * vex_delr * vfc; return vgijk * vex_delr * vfc;
@ -1258,7 +1258,7 @@ IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::force_zeta_vector( void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::force_zeta_vector(
const c_outer_t * param, const c_outer_t * param,
ivec xjw, ivec xjw,
bvec mask, bvec mask,
fvec vrij, fvec vzeta_ij, fvec vrij, fvec vzeta_ij,
@ -1402,9 +1402,9 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
vfc_d = v::blend(vmask_need_sine, vfc_d, fvec(-0.5) * vtmp * vfccos); vfc_d = v::blend(vmask_need_sine, vfc_d, fvec(-0.5) * vtmp * vfccos);
} }
fvec vzeta_d_fc = vfc_d * vgijk * vex_delr; fvec vzeta_d_fc = vfc_d * vgijk * vex_delr;
fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr; fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr;
fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d; fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d;
if (ZETA) *zeta = vfc * vgijk * vex_delr; if (ZETA) *zeta = vfc * vgijk * vex_delr;
fvec vminus_costheta = - vcostheta; fvec vminus_costheta = - vcostheta;
@ -1417,7 +1417,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx); fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx);
fvec vdcosdriy = -(vdcosdrjy + vdcosdrky); fvec vdcosdriy = -(vdcosdrjy + vdcosdrky);
fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz); fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz);
*fix = vprefactor * (vzeta_d_gijk * vdcosdrix + vzeta_d_ex_delr * (rik_hatx - vrij_hatx) - vzeta_d_fc * rik_hatx); *fix = vprefactor * (vzeta_d_gijk * vdcosdrix + vzeta_d_ex_delr * (rik_hatx - vrij_hatx) - vzeta_d_fc * rik_hatx);
*fiy = vprefactor * (vzeta_d_gijk * vdcosdriy + vzeta_d_ex_delr * (rik_haty - vrij_haty) - vzeta_d_fc * rik_haty); *fiy = vprefactor * (vzeta_d_gijk * vdcosdriy + vzeta_d_ex_delr * (rik_haty - vrij_haty) - vzeta_d_fc * rik_haty);
*fiz = vprefactor * (vzeta_d_gijk * vdcosdriz + vzeta_d_ex_delr * (rik_hatz - vrij_hatz) - vzeta_d_fc * rik_hatz); *fiz = vprefactor * (vzeta_d_gijk * vdcosdriz + vzeta_d_ex_delr * (rik_hatz - vrij_hatz) - vzeta_d_fc * rik_hatz);

View File

@ -75,14 +75,14 @@ class PairTersoffIntel : public PairTersoff {
}; };
ForceConst<float> force_const_single; ForceConst<float> force_const_single;
ForceConst<double> force_const_double; ForceConst<double> force_const_double;
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc); const ForceConst<flt_t> &fc);
template <int EFLAG, class flt_t, class acc_t> template <int EFLAG, class flt_t, class acc_t>
void eval(const int offload, const int vflag, void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc, const int astart, const int aend); const ForceConst<flt_t> &fc, const int astart, const int aend);
template <class flt_t, class acc_t> template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc, void pack_force_const(ForceConst<flt_t> &fc,

File diff suppressed because it is too large Load Diff

View File

@ -1,238 +1,238 @@
/* -*- c++ -*- ---------------------------------------------------------- /* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under certain rights in this software. This software is distributed under
the GNU General Public License. the GNU General Public License.
See the README file in the top-level LAMMPS directory. See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
Contributing authors: William McDoniel (RWTH Aachen University) Contributing authors: William McDoniel (RWTH Aachen University)
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
#ifdef KSPACE_CLASS #ifdef KSPACE_CLASS
KSpaceStyle(pppm/disp/intel,PPPMDispIntel) KSpaceStyle(pppm/disp/intel,PPPMDispIntel)
#else #else
#ifndef LMP_PPPMINTEL_DISP_H #ifndef LMP_PPPMINTEL_DISP_H
#define LMP_PPPMINTEL_DISP_H #define LMP_PPPMINTEL_DISP_H
#include "pppm_disp.h" #include "pppm_disp.h"
#include "fix_intel.h" #include "fix_intel.h"
namespace LAMMPS_NS { namespace LAMMPS_NS {
// KSpace solver class derived from PPPMDisp; this header registers it under
// the style name pppm/disp/intel via the KSpaceStyle() macro.
// The class declares per-particle work arrays, optional lookup tables, and a
// set of templated kernels. Each kernel has a three-parameter template
// (flt_t, acc_t, use_table) that is only declared here, plus a two-parameter
// inline overload that selects use_table at run time from _use_table.
class PPPMDispIntel : public PPPMDisp { class PPPMDispIntel : public PPPMDisp {
public: public:
// Constructor taking the LAMMPS instance pointer plus an (int, char **)
// argument pair that is presumably the style's narg/arg list - confirm
// against the base-class constructor.
PPPMDispIntel(class LAMMPS *, int, char **); PPPMDispIntel(class LAMMPS *, int, char **);
virtual ~PPPMDispIntel(); virtual ~PPPMDispIntel();
// init() and compute(int, int) are declared virtual here; presumably they
// override the PPPMDisp versions - confirm against pppm_disp.h.
virtual void init(); virtual void init();
virtual void compute(int, int); virtual void compute(int, int);
// use_base() exists only in builds where _LMP_INTEL_OFFLOAD is defined.
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
int use_base(); int use_base();
#endif #endif
protected: protected:
// Pointer to the FixIntel instance used by this style.
FixIntel *fix; FixIntel *fix;
// NOTE(review): looks like a flag for the long-range-thread (LRT) mode of
// the Intel package - confirm where it is set in the .cpp file.
int _use_lrt; int _use_lrt;
// Two-level array of grid values; by its name, one density buffer per extra
// thread - TODO confirm allocation shape in init().
FFT_SCALAR **perthread_density; FFT_SCALAR **perthread_density;
// One-dimensional work arrays named for the x/y/z field components.
// Presumably indexed by particle; verify against the fieldforce kernels.
FFT_SCALAR *particle_ekx; FFT_SCALAR *particle_ekx;
FFT_SCALAR *particle_eky; FFT_SCALAR *particle_eky;
FFT_SCALAR *particle_ekz; FFT_SCALAR *particle_ekz;
// Seven further x/y/z triplets with suffixes 0 through 6.
// NOTE(review): presumably one triplet per dispersion grid used by the
// arithmetic-mixing kernels - confirm against the implementation.
FFT_SCALAR *particle_ekx0; FFT_SCALAR *particle_ekx0;
FFT_SCALAR *particle_eky0; FFT_SCALAR *particle_eky0;
FFT_SCALAR *particle_ekz0; FFT_SCALAR *particle_ekz0;
FFT_SCALAR *particle_ekx1; FFT_SCALAR *particle_ekx1;
FFT_SCALAR *particle_eky1; FFT_SCALAR *particle_eky1;
FFT_SCALAR *particle_ekz1; FFT_SCALAR *particle_ekz1;
FFT_SCALAR *particle_ekx2; FFT_SCALAR *particle_ekx2;
FFT_SCALAR *particle_eky2; FFT_SCALAR *particle_eky2;
FFT_SCALAR *particle_ekz2; FFT_SCALAR *particle_ekz2;
FFT_SCALAR *particle_ekx3; FFT_SCALAR *particle_ekx3;
FFT_SCALAR *particle_eky3; FFT_SCALAR *particle_eky3;
FFT_SCALAR *particle_ekz3; FFT_SCALAR *particle_ekz3;
FFT_SCALAR *particle_ekx4; FFT_SCALAR *particle_ekx4;
FFT_SCALAR *particle_eky4; FFT_SCALAR *particle_eky4;
FFT_SCALAR *particle_ekz4; FFT_SCALAR *particle_ekz4;
FFT_SCALAR *particle_ekx5; FFT_SCALAR *particle_ekx5;
FFT_SCALAR *particle_eky5; FFT_SCALAR *particle_eky5;
FFT_SCALAR *particle_ekz5; FFT_SCALAR *particle_ekz5;
FFT_SCALAR *particle_ekx6; FFT_SCALAR *particle_ekx6;
FFT_SCALAR *particle_eky6; FFT_SCALAR *particle_eky6;
FFT_SCALAR *particle_ekz6; FFT_SCALAR *particle_ekz6;
// _use_table is the run-time switch read by every inline dispatcher below:
// the value 1 selects the use_table=1 kernel, any other value selects 0.
int _use_table; int _use_table;
// Lookup-table state. By their names, rho_points is the number of table rows
// and the four tables hold stencil weights (rho*) and their derivatives
// (drho*) for the two grid families - TODO confirm in precompute_rho().
int rho_points; int rho_points;
FFT_SCALAR **rho_lookup; FFT_SCALAR **rho_lookup;
FFT_SCALAR **rho6_lookup; FFT_SCALAR **rho6_lookup;
FFT_SCALAR **drho_lookup; FFT_SCALAR **drho_lookup;
FFT_SCALAR **drho6_lookup; FFT_SCALAR **drho6_lookup;
// NOTE(review): presumably the scale and offset that map a particle's grid
// offset to a table row index - confirm against the kernels that use them.
FFT_SCALAR half_rho_scale, half_rho_scale_plus; FFT_SCALAR half_rho_scale, half_rho_scale_plus;
int _use_packing; int _use_packing;
// _use_base is only a member in builds where _LMP_INTEL_OFFLOAD is defined.
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
int _use_base; int _use_base;
#endif #endif
// Templated particle-to-grid mapping routine. The parameters are unnamed in
// this declaration (four doubles, an int ** array, eight ints, and the
// buffers pointer); see the definition for their meaning.
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void particle_map(double, double, double, void particle_map(double, double, double,
double, int **, int, int, double, int **, int, int,
int, int, int, int, int, int,
int, int, int, int, int, int,
IntelBuffers<flt_t,acc_t> *buffers); IntelBuffers<flt_t,acc_t> *buffers);
// make_rho_* kernels. Presumably these spread particle quantities onto the
// grid, with the suffix naming the variant (_c, _g, _a, _none) - confirm the
// meaning of each suffix against pppm_disp.
// Pattern used for every kernel from here on: the declaration with the extra
// use_table template parameter is the real kernel; the overload without it
// forwards to use_table=1 when _use_table == 1 and to use_table=0 otherwise,
// so the table choice becomes a compile-time constant inside the kernel.
template<class flt_t, class acc_t, int use_table> template<class flt_t, class acc_t, int use_table>
void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers); void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) { void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) { if (_use_table == 1) {
make_rho_c<flt_t,acc_t,1>(buffers); make_rho_c<flt_t,acc_t,1>(buffers);
} else { } else {
make_rho_c<flt_t,acc_t,0>(buffers); make_rho_c<flt_t,acc_t,0>(buffers);
} }
} }
// make_rho_g: same kernel/dispatcher pair for the _g variant.
template<class flt_t, class acc_t, int use_table> template<class flt_t, class acc_t, int use_table>
void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers); void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) { void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) { if (_use_table == 1) {
make_rho_g<flt_t,acc_t,1>(buffers); make_rho_g<flt_t,acc_t,1>(buffers);
} else { } else {
make_rho_g<flt_t,acc_t,0>(buffers); make_rho_g<flt_t,acc_t,0>(buffers);
} }
} }
// make_rho_a: same kernel/dispatcher pair for the _a variant.
template<class flt_t, class acc_t, int use_table> template<class flt_t, class acc_t, int use_table>
void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers); void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) { void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) { if (_use_table == 1) {
make_rho_a<flt_t,acc_t,1>(buffers); make_rho_a<flt_t,acc_t,1>(buffers);
} else { } else {
make_rho_a<flt_t,acc_t,0>(buffers); make_rho_a<flt_t,acc_t,0>(buffers);
} }
} }
// make_rho_none: same kernel/dispatcher pair for the _none variant.
template<class flt_t, class acc_t, int use_table> template<class flt_t, class acc_t, int use_table>
void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers); void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) { void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) { if (_use_table == 1) {
make_rho_none<flt_t,acc_t,1>(buffers); make_rho_none<flt_t,acc_t,1>(buffers);
} else { } else {
make_rho_none<flt_t,acc_t,0>(buffers); make_rho_none<flt_t,acc_t,0>(buffers);
} }
} }
// fieldforce_* kernels, one per variant (_c, _g, _a, _none) and per suffix
// (_ik, _ad). NOTE(review): the _ik/_ad suffixes presumably name the two
// differentiation schemes of the solver - confirm against pppm_disp.
// Each pair below follows the same _use_table dispatch pattern.
template<class flt_t, class acc_t, int use_table> template<class flt_t, class acc_t, int use_table>
void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers); void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) { void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) { if (_use_table == 1) {
fieldforce_c_ik<flt_t,acc_t,1>(buffers); fieldforce_c_ik<flt_t,acc_t,1>(buffers);
} else { } else {
fieldforce_c_ik<flt_t,acc_t,0>(buffers); fieldforce_c_ik<flt_t,acc_t,0>(buffers);
} }
} }
// fieldforce_c_ad: kernel declaration plus run-time dispatcher.
template<class flt_t, class acc_t, int use_table> template<class flt_t, class acc_t, int use_table>
void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers); void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) { void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) { if (_use_table == 1) {
fieldforce_c_ad<flt_t,acc_t,1>(buffers); fieldforce_c_ad<flt_t,acc_t,1>(buffers);
} else { } else {
fieldforce_c_ad<flt_t,acc_t,0>(buffers); fieldforce_c_ad<flt_t,acc_t,0>(buffers);
} }
} }
// fieldforce_g_ik: kernel declaration plus run-time dispatcher.
template<class flt_t, class acc_t, int use_table> template<class flt_t, class acc_t, int use_table>
void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers); void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) { void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) { if (_use_table == 1) {
fieldforce_g_ik<flt_t,acc_t,1>(buffers); fieldforce_g_ik<flt_t,acc_t,1>(buffers);
} else { } else {
fieldforce_g_ik<flt_t,acc_t,0>(buffers); fieldforce_g_ik<flt_t,acc_t,0>(buffers);
} }
} }
// fieldforce_g_ad: kernel declaration plus run-time dispatcher.
template<class flt_t, class acc_t, int use_table> template<class flt_t, class acc_t, int use_table>
void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers); void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) { void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) { if (_use_table == 1) {
fieldforce_g_ad<flt_t,acc_t,1>(buffers); fieldforce_g_ad<flt_t,acc_t,1>(buffers);
} else { } else {
fieldforce_g_ad<flt_t,acc_t,0>(buffers); fieldforce_g_ad<flt_t,acc_t,0>(buffers);
} }
} }
// fieldforce_a_ik: kernel declaration plus run-time dispatcher.
template<class flt_t, class acc_t, int use_table> template<class flt_t, class acc_t, int use_table>
void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers); void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) { void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) { if (_use_table == 1) {
fieldforce_a_ik<flt_t,acc_t,1>(buffers); fieldforce_a_ik<flt_t,acc_t,1>(buffers);
} else { } else {
fieldforce_a_ik<flt_t,acc_t,0>(buffers); fieldforce_a_ik<flt_t,acc_t,0>(buffers);
} }
} }
// fieldforce_a_ad: kernel declaration plus run-time dispatcher.
template<class flt_t, class acc_t, int use_table> template<class flt_t, class acc_t, int use_table>
void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers); void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) { void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) { if (_use_table == 1) {
fieldforce_a_ad<flt_t,acc_t,1>(buffers); fieldforce_a_ad<flt_t,acc_t,1>(buffers);
} else { } else {
fieldforce_a_ad<flt_t,acc_t,0>(buffers); fieldforce_a_ad<flt_t,acc_t,0>(buffers);
} }
} }
// fieldforce_none_ik: kernel declaration plus run-time dispatcher.
template<class flt_t, class acc_t, int use_table> template<class flt_t, class acc_t, int use_table>
void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers); void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) { void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) { if (_use_table == 1) {
fieldforce_none_ik<flt_t,acc_t,1>(buffers); fieldforce_none_ik<flt_t,acc_t,1>(buffers);
} else { } else {
fieldforce_none_ik<flt_t,acc_t,0>(buffers); fieldforce_none_ik<flt_t,acc_t,0>(buffers);
} }
} }
// fieldforce_none_ad: kernel declaration plus run-time dispatcher.
template<class flt_t, class acc_t, int use_table> template<class flt_t, class acc_t, int use_table>
void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers); void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) { void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) { if (_use_table == 1) {
fieldforce_none_ad<flt_t,acc_t,1>(buffers); fieldforce_none_ad<flt_t,acc_t,1>(buffers);
} else { } else {
fieldforce_none_ad<flt_t,acc_t,0>(buffers); fieldforce_none_ad<flt_t,acc_t,0>(buffers);
} }
} }
// Declared here, defined elsewhere; by its name it fills the rho/drho lookup
// tables used when _use_table is set - TODO confirm in the .cpp file.
void precompute_rho(); void precompute_rho();
}; };
} }
#endif #endif
#endif #endif

View File

@ -14,7 +14,7 @@
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
Contributing authors: William McDoniel (RWTH Aachen University) Contributing authors: William McDoniel (RWTH Aachen University)
Rodrigo Canales (RWTH Aachen University) Rodrigo Canales (RWTH Aachen University)
Markus Hoehnerbach (RWTH Aachen University) Markus Hoehnerbach (RWTH Aachen University)
W. Michael Brown (Intel) W. Michael Brown (Intel)
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
@ -62,10 +62,10 @@ PPPMIntel::PPPMIntel(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
perthread_density = NULL; perthread_density = NULL;
particle_ekx = particle_eky = particle_ekz = NULL; particle_ekx = particle_eky = particle_ekz = NULL;
rho_lookup = drho_lookup = NULL; rho_lookup = drho_lookup = NULL;
rho_points = 0; rho_points = 0;
vdxy_brick = vdz0_brick = NULL; vdxy_brick = vdz0_brick = NULL;
work3 = NULL; work3 = NULL;
cg_pack = NULL; cg_pack = NULL;
@ -120,20 +120,20 @@ void PPPMIntel::init()
if ((comm->nthreads > 1) && !_use_lrt) { if ((comm->nthreads > 1) && !_use_lrt) {
memory->destroy(perthread_density); memory->destroy(perthread_density);
memory->create(perthread_density, comm->nthreads-1, memory->create(perthread_density, comm->nthreads-1,
ngrid + INTEL_P3M_ALIGNED_MAXORDER, ngrid + INTEL_P3M_ALIGNED_MAXORDER,
"pppmintel:perthread_density"); "pppmintel:perthread_density");
} }
_use_table = fix->pppm_table(); _use_table = fix->pppm_table();
if (_use_table) { if (_use_table) {
rho_points = 5000; rho_points = 5000;
memory->destroy(rho_lookup); memory->destroy(rho_lookup);
memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
"pppmintel:rho_lookup"); "pppmintel:rho_lookup");
if(differentiation_flag == 1) { if(differentiation_flag == 1) {
memory->destroy(drho_lookup); memory->destroy(drho_lookup);
memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
"pppmintel:drho_lookup"); "pppmintel:drho_lookup");
} }
precompute_rho(); precompute_rho();
} }
@ -141,7 +141,7 @@ void PPPMIntel::init()
if (order > INTEL_P3M_MAXORDER) if (order > INTEL_P3M_MAXORDER)
error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n"); error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n");
_use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16) _use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16)
&& (sizeof(FFT_SCALAR) == sizeof(float)) && (sizeof(FFT_SCALAR) == sizeof(float))
&& (differentiation_flag == 0); && (differentiation_flag == 0);
if (_use_packing) { if (_use_packing) {
@ -149,13 +149,13 @@ void PPPMIntel::init()
memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out); memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out); memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out); memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2, memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
"pppmintel:vdxy_brick"); "pppmintel:vdxy_brick");
memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out); memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2, memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
"pppmintel:vdz0_brick"); "pppmintel:vdz0_brick");
memory->destroy(work3); memory->destroy(work3);
memory->create(work3, 2*nfft_both, "pppmintel:work3"); memory->create(work3, 2*nfft_both, "pppmintel:work3");
@ -163,10 +163,10 @@ void PPPMIntel::init()
delete cg_pack; delete cg_pack;
int (*procneigh)[2] = comm->procneigh; int (*procneigh)[2] = comm->procneigh;
cg_pack = new GridComm(lmp,world,2,0, 2*nxlo_in,2*nxhi_in+1,nylo_in, cg_pack = new GridComm(lmp,world,2,0, 2*nxlo_in,2*nxhi_in+1,nylo_in,
nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1, nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1,
nylo_out,nyhi_out,nzlo_out,nzhi_out, nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0], procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]); procneigh[1][1],procneigh[2][0],procneigh[2][1]);
cg_pack->ghost_notify(); cg_pack->ghost_notify();
cg_pack->setup(); cg_pack->setup();
@ -484,7 +484,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
{ {
const int nix = nxhi_out - nxlo_out + 1; const int nix = nxhi_out - nxlo_out + 1;
const int niy = nyhi_out - nylo_out + 1; const int niy = nyhi_out - nylo_out + 1;
const flt_t lo0 = boxlo[0]; const flt_t lo0 = boxlo[0];
const flt_t lo1 = boxlo[1]; const flt_t lo1 = boxlo[1];
const flt_t lo2 = boxlo[2]; const flt_t lo2 = boxlo[2];
@ -503,7 +503,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
memset(my_density, 0, ngrid * sizeof(FFT_SCALAR)); memset(my_density, 0, ngrid * sizeof(FFT_SCALAR));
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
int nx = part2grid[i][0]; int nx = part2grid[i][0];
int ny = part2grid[i][1]; int ny = part2grid[i][1];
int nz = part2grid[i][2]; int nz = part2grid[i][2];
@ -515,9 +515,9 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
if (use_table) { if (use_table) {
dx = dx*half_rho_scale + half_rho_scale_plus; dx = dx*half_rho_scale + half_rho_scale_plus;
int idx = dx; int idx = dx;
@ -527,7 +527,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
int idz = dz; int idz = dz;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma simd #pragma simd
#endif #endif
for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k]; rho[0][k] = rho_lookup[idx][k];
rho[1][k] = rho_lookup[idy][k]; rho[1][k] = rho_lookup[idy][k];
@ -536,11 +536,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
} else { } else {
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma simd #pragma simd
#endif #endif
for (int k = nlower; k <= nupper; k++) { for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3; FFT_SCALAR r1,r2,r3;
r1 = r2 = r3 = ZEROF; r1 = r2 = r3 = ZEROF;
for (int l = order-1; l >= 0; l--) { for (int l = order-1; l >= 0; l--) {
r1 = rho_coeff[l][k] + r1*dx; r1 = rho_coeff[l][k] + r1*dx;
r2 = rho_coeff[l][k] + r2*dy; r2 = rho_coeff[l][k] + r2*dy;
@ -551,24 +551,24 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
rho[2][k-nlower] = r3; rho[2][k-nlower] = r3;
} }
} }
FFT_SCALAR z0 = fdelvolinv * q[i]; FFT_SCALAR z0 = fdelvolinv * q[i];
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7 #pragma loop_count=7
#endif #endif
for (int n = 0; n < order; n++) { for (int n = 0; n < order; n++) {
int mz = n*nix*niy + nzsum; int mz = n*nix*niy + nzsum;
FFT_SCALAR y0 = z0*rho[2][n]; FFT_SCALAR y0 = z0*rho[2][n];
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7 #pragma loop_count=7
#endif #endif
for (int m = 0; m < order; m++) { for (int m = 0; m < order; m++) {
int mzy = m*nix + mz; int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m]; FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma simd #pragma simd
#endif #endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mzyx = l + mzy; int mzyx = l + mzy;
my_density[mzyx] += x0*rho[0][l]; my_density[mzyx] += x0*rho[0][l];
@ -709,21 +709,21 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7 #pragma loop_count=7
#endif #endif
for (int n = 0; n < order; n++) { for (int n = 0; n < order; n++) {
int mz = n+nzsum; int mz = n+nzsum;
FFT_SCALAR z0 = rho2[n]; FFT_SCALAR z0 = rho2[n];
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7 #pragma loop_count=7
#endif #endif
for (int m = 0; m < order; m++) { for (int m = 0; m < order; m++) {
int my = m+nysum; int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m]; FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma simd #pragma simd
#endif #endif
for (int l = 0; l < (use_packing ? 2 : 1) * for (int l = 0; l < (use_packing ? 2 : 1) *
INTEL_P3M_ALIGNED_MAXORDER; l++) { INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mx = l+nxsum; int mx = l+nxsum;
FFT_SCALAR x0 = y0*rho0[l]; FFT_SCALAR x0 = y0*rho0[l];
if (use_packing) { if (use_packing) {
@ -824,13 +824,13 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
const flt_t fsf_coeff3 = sf_coeff[3]; const flt_t fsf_coeff3 = sf_coeff[3];
const flt_t fsf_coeff4 = sf_coeff[4]; const flt_t fsf_coeff4 = sf_coeff[4];
const flt_t fsf_coeff5 = sf_coeff[5]; const flt_t fsf_coeff5 = sf_coeff[5];
int ifrom, ito, tid; int ifrom, ito, tid;
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
_alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
for (int i = ifrom; i < ito; i++) { for (int i = ifrom; i < ito; i++) {
int nx = part2grid[i][0]; int nx = part2grid[i][0];
int ny = part2grid[i][1]; int ny = part2grid[i][1];
@ -838,11 +838,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
int nxsum = nx + nlower; int nxsum = nx + nlower;
int nysum = ny + nlower; int nysum = ny + nlower;
int nzsum = nz + nlower; int nzsum = nz + nlower;
if (use_table) { if (use_table) {
dx = dx*half_rho_scale + half_rho_scale_plus; dx = dx*half_rho_scale + half_rho_scale_plus;
int idx = dx; int idx = dx;
@ -852,7 +852,7 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
int idz = dz; int idz = dz;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma simd #pragma simd
#endif #endif
for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
rho[0][k] = rho_lookup[idx][k]; rho[0][k] = rho_lookup[idx][k];
rho[1][k] = rho_lookup[idy][k]; rho[1][k] = rho_lookup[idy][k];
@ -864,11 +864,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
} else { } else {
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma simd #pragma simd
#endif #endif
for (int k = nlower; k <= nupper; k++) { for (int k = nlower; k <= nupper; k++) {
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
dr1 = dr2 = dr3 = ZEROF; dr1 = dr2 = dr3 = ZEROF;
r1 = rho_coeff[order-1][k]; r1 = rho_coeff[order-1][k];
r2 = rho_coeff[order-1][k]; r2 = rho_coeff[order-1][k];
r3 = rho_coeff[order-1][k]; r3 = rho_coeff[order-1][k];
@ -888,21 +888,21 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
drho[2][k-nlower] = dr3; drho[2][k-nlower] = dr3;
} }
} }
_alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
_alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
_alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7 #pragma loop_count=7
#endif #endif
for (int n = 0; n < order; n++) { for (int n = 0; n < order; n++) {
int mz = n + nzsum; int mz = n + nzsum;
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7 #pragma loop_count=7
#endif #endif
for (int m = 0; m < order; m++) { for (int m = 0; m < order; m++) {
int my = m + nysum; int my = m + nysum;
FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
@ -910,7 +910,7 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma simd #pragma simd
#endif #endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mx = l + nxsum; int mx = l + nxsum;
ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx]; ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
@ -919,17 +919,17 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
} }
} }
} }
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma simd #pragma simd
#endif #endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
particle_ekx[i] += ekx[l]; particle_ekx[i] += ekx[l];
particle_eky[i] += eky[l]; particle_eky[i] += eky[l];
particle_ekz[i] += ekz[l]; particle_ekz[i] += ekz[l];
} }
} }
#if defined(LMP_SIMD_COMPILER) #if defined(LMP_SIMD_COMPILER)
#pragma simd #pragma simd
#endif #endif
@ -937,12 +937,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
particle_ekx[i] *= hx_inv; particle_ekx[i] *= hx_inv;
particle_eky[i] *= hy_inv; particle_eky[i] *= hy_inv;
particle_ekz[i] *= hz_inv; particle_ekz[i] *= hz_inv;
// convert E-field to force // convert E-field to force
const flt_t qfactor = fqqrd2es * q[i]; const flt_t qfactor = fqqrd2es * q[i];
const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i]; const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];
const flt_t s1 = x[i].x * hx_inv; const flt_t s1 = x[i].x * hx_inv;
const flt_t s2 = x[i].y * hy_inv; const flt_t s2 = x[i].y * hy_inv;
const flt_t s3 = x[i].z * hz_inv; const flt_t s3 = x[i].z * hz_inv;
@ -950,16 +950,16 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
sf += fsf_coeff1 * sin(ffour_pi * s1); sf += fsf_coeff1 * sin(ffour_pi * s1);
sf *= twoqsq; sf *= twoqsq;
f[i].x += qfactor * particle_ekx[i] - fqqrd2es * sf; f[i].x += qfactor * particle_ekx[i] - fqqrd2es * sf;
sf = fsf_coeff2 * sin(ftwo_pi * s2); sf = fsf_coeff2 * sin(ftwo_pi * s2);
sf += fsf_coeff3 * sin(ffour_pi * s2); sf += fsf_coeff3 * sin(ffour_pi * s2);
sf *= twoqsq; sf *= twoqsq;
f[i].y += qfactor * particle_eky[i] - fqqrd2es * sf; f[i].y += qfactor * particle_eky[i] - fqqrd2es * sf;
sf = fsf_coeff4 * sin(ftwo_pi * s3); sf = fsf_coeff4 * sin(ftwo_pi * s3);
sf += fsf_coeff5 * sin(ffour_pi * s3); sf += fsf_coeff5 * sin(ffour_pi * s3);
sf *= twoqsq; sf *= twoqsq;
if (slabflag != 2) f[i].z += qfactor * particle_ekz[i] - fqqrd2es * sf; if (slabflag != 2) f[i].z += qfactor * particle_ekz[i] - fqqrd2es * sf;
} }
} }
@ -1000,7 +1000,7 @@ void PPPMIntel::poisson_ik_intel()
n = 0; n = 0;
for (i = 0; i < nfft; i++) { for (i = 0; i < nfft; i++) {
eng = s2 * greensfn[i] * (work1[n]*work1[n] + eng = s2 * greensfn[i] * (work1[n]*work1[n] +
work1[n+1]*work1[n+1]); work1[n+1]*work1[n+1]);
for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
if (eflag_global) energy += eng; if (eflag_global) energy += eng;
n += 2; n += 2;
@ -1069,10 +1069,10 @@ void PPPMIntel::poisson_ik_intel()
for (j = nylo_in; j <= nyhi_in; j++) for (j = nylo_in; j <= nyhi_in; j++)
for (i = nxlo_in; i <= nxhi_in; i++) { for (i = nxlo_in; i <= nxhi_in; i++) {
vdxy_brick[k][j][2*i] = work2[n]; vdxy_brick[k][j][2*i] = work2[n];
vdxy_brick[k][j][2*i+1] = work3[n]; vdxy_brick[k][j][2*i+1] = work3[n];
n += 2; n += 2;
} }
// z direction gradient // z direction gradient
n = 0; n = 0;
@ -1091,7 +1091,7 @@ void PPPMIntel::poisson_ik_intel()
for (j = nylo_in; j <= nyhi_in; j++) for (j = nylo_in; j <= nyhi_in; j++)
for (i = nxlo_in; i <= nxhi_in; i++) { for (i = nxlo_in; i <= nxhi_in; i++) {
vdz0_brick[k][j][2*i] = work2[n]; vdz0_brick[k][j][2*i] = work2[n];
vdz0_brick[k][j][2*i+1] = 0.; vdz0_brick[k][j][2*i+1] = 0.;
n += 2; n += 2;
} }
} }
@ -1202,7 +1202,7 @@ double PPPMIntel::memory_usage()
} }
} }
if (_use_packing) { if (_use_packing) {
bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1) bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1)
* (2 * nxhi_out + 1 - 2 * nxlo_out + 1) * sizeof(FFT_SCALAR); * (2 * nxhi_out + 1 - 2 * nxlo_out + 1) * sizeof(FFT_SCALAR);
bytes -= 3 * (nxhi_out - nxlo_out + 1) * (nyhi_out - nylo_out + 1) bytes -= 3 * (nxhi_out - nxlo_out + 1) * (nyhi_out - nylo_out + 1)
* (nzhi_out - nzlo_out + 1) * sizeof(FFT_SCALAR); * (nzhi_out - nzlo_out + 1) * sizeof(FFT_SCALAR);
@ -1228,7 +1228,7 @@ void PPPMIntel::pack_buffers()
{ {
int ifrom, ito, tid; int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost, IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
packthreads, packthreads,
sizeof(IntelBuffers<float,double>::atom_t)); sizeof(IntelBuffers<float,double>::atom_t));
if (fix->precision() == FixIntel::PREC_MODE_MIXED) if (fix->precision() == FixIntel::PREC_MODE_MIXED)
fix->get_mixed_buffers()->thr_pack(ifrom,ito,1); fix->get_mixed_buffers()->thr_pack(ifrom,ito,1);

View File

@ -14,7 +14,7 @@
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
Contributing authors: William McDoniel (RWTH Aachen University) Contributing authors: William McDoniel (RWTH Aachen University)
Rodrigo Canales (RWTH Aachen University) Rodrigo Canales (RWTH Aachen University)
Markus Hoehnerbach (RWTH Aachen University) Markus Hoehnerbach (RWTH Aachen University)
W. Michael Brown (Intel) W. Michael Brown (Intel)
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
@ -77,7 +77,7 @@ class PPPMIntel : public PPPM {
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void test_function(IntelBuffers<flt_t,acc_t> *buffers); void test_function(IntelBuffers<flt_t,acc_t> *buffers);
void precompute_rho(); void precompute_rho();
template<class flt_t, class acc_t> template<class flt_t, class acc_t>
void particle_map(IntelBuffers<flt_t,acc_t> *buffers); void particle_map(IntelBuffers<flt_t,acc_t> *buffers);

View File

@ -51,7 +51,7 @@ VerletLRTIntel::VerletLRTIntel(LAMMPS *lmp, int narg, char **arg) :
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
VerletLRTIntel::~VerletLRTIntel() VerletLRTIntel::~VerletLRTIntel()
{ {
#if defined(_LMP_INTEL_LRT_PTHREAD) #if defined(_LMP_INTEL_LRT_PTHREAD)
pthread_mutex_destroy(&_kmutex); pthread_mutex_destroy(&_kmutex);
@ -67,10 +67,10 @@ void VerletLRTIntel::init()
Verlet::init(); Verlet::init();
_intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0)); _intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0));
#ifdef LMP_INTEL_NOLRT #ifdef LMP_INTEL_NOLRT
error->all(FLERR, error->all(FLERR,
"LRT otion for Intel package disabled at compile time"); "LRT otion for Intel package disabled at compile time");
#endif #endif
} }
@ -83,7 +83,7 @@ void VerletLRTIntel::setup(int flag)
if (_intel_kspace == 0) { if (_intel_kspace == 0) {
Verlet::setup(flag); Verlet::setup(flag);
return; return;
} }
#ifdef _LMP_INTEL_OFFLOAD #ifdef _LMP_INTEL_OFFLOAD
if (_intel_kspace->use_base()) { if (_intel_kspace->use_base()) {
@ -154,15 +154,15 @@ void VerletLRTIntel::setup(int flag)
_intel_kspace->setup(); _intel_kspace->setup();
#if defined(_LMP_INTEL_LRT_PTHREAD) #if defined(_LMP_INTEL_LRT_PTHREAD)
pthread_create(&_kspace_thread, &_kspace_attr, pthread_create(&_kspace_thread, &_kspace_attr,
&VerletLRTIntel::k_launch_loop, this); &VerletLRTIntel::k_launch_loop, this);
#elif defined(_LMP_INTEL_LRT_11) #elif defined(_LMP_INTEL_LRT_11)
std::thread kspace_thread; std::thread kspace_thread;
if (kspace_compute_flag) if (kspace_compute_flag)
_kspace_thread=std::thread([=]{ _intel_kspace->compute_first(eflag, _kspace_thread=std::thread([=]{ _intel_kspace->compute_first(eflag,
vflag); }); vflag); });
else else
_kspace_thread=std::thread([=]{ _intel_kspace->compute_dummy(eflag, _kspace_thread=std::thread([=]{ _intel_kspace->compute_dummy(eflag,
vflag); }); vflag); });
#endif #endif
@ -297,8 +297,8 @@ void VerletLRTIntel::run(int n)
pthread_mutex_unlock(&_kmutex); pthread_mutex_unlock(&_kmutex);
#elif defined(_LMP_INTEL_LRT_11) #elif defined(_LMP_INTEL_LRT_11)
std::thread kspace_thread; std::thread kspace_thread;
if (kspace_compute_flag) if (kspace_compute_flag)
kspace_thread=std::thread([=] { kspace_thread=std::thread([=] {
_intel_kspace->compute_first(eflag, vflag); _intel_kspace->compute_first(eflag, vflag);
timer->stamp(Timer::KSPACE); timer->stamp(Timer::KSPACE);
} ); } );
@ -329,7 +329,7 @@ void VerletLRTIntel::run(int n)
_kspace_done = 0; _kspace_done = 0;
pthread_mutex_unlock(&_kmutex); pthread_mutex_unlock(&_kmutex);
#elif defined(_LMP_INTEL_LRT_11) #elif defined(_LMP_INTEL_LRT_11)
if (kspace_compute_flag) if (kspace_compute_flag)
kspace_thread.join(); kspace_thread.join();
#endif #endif
@ -367,7 +367,7 @@ void VerletLRTIntel::run(int n)
} }
#if defined(_LMP_INTEL_LRT_PTHREAD) #if defined(_LMP_INTEL_LRT_PTHREAD)
if (run_cancelled) if (run_cancelled)
pthread_cancel(_kspace_thread); pthread_cancel(_kspace_thread);
else { else {
pthread_mutex_lock(&_kmutex); pthread_mutex_lock(&_kmutex);
@ -390,9 +390,9 @@ void * VerletLRTIntel::k_launch_loop(void *context)
{ {
VerletLRTIntel * const c = (VerletLRTIntel *)context; VerletLRTIntel * const c = (VerletLRTIntel *)context;
if (c->kspace_compute_flag) if (c->kspace_compute_flag)
c->_intel_kspace->compute_first(c->eflag, c->vflag); c->_intel_kspace->compute_first(c->eflag, c->vflag);
else else
c->_intel_kspace->compute_dummy(c->eflag, c->vflag); c->_intel_kspace->compute_dummy(c->eflag, c->vflag);
pthread_mutex_lock(&(c->_kmutex)); pthread_mutex_lock(&(c->_kmutex));
@ -408,7 +408,7 @@ void * VerletLRTIntel::k_launch_loop(void *context)
pthread_mutex_unlock(&(c->_kmutex)); pthread_mutex_unlock(&(c->_kmutex));
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
if (c->kspace_compute_flag) { if (c->kspace_compute_flag) {
c->_intel_kspace->compute_first(c->eflag, c->vflag); c->_intel_kspace->compute_first(c->eflag, c->vflag);
c->timer->stamp(Timer::KSPACE); c->timer->stamp(Timer::KSPACE);