massive whitespace cleanup in USER-INTEL
removed are: - DOS/Windows text format carriage return characters (^M) - tabs replaced with spaces (tabs are evil!!) - trailing whitespace
This commit is contained in:
@ -37,7 +37,7 @@ typedef struct { int a,b,c,t; } int4_t;
|
|||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp)
|
AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp)
|
||||||
{
|
{
|
||||||
suffix_flag |= Suffix::INTEL;
|
suffix_flag |= Suffix::INTEL;
|
||||||
}
|
}
|
||||||
@ -74,8 +74,8 @@ void AngleCharmmIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void AngleCharmmIntel::compute(int eflag, int vflag,
|
void AngleCharmmIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) ev_setup(eflag,vflag);
|
if (eflag || vflag) ev_setup(eflag,vflag);
|
||||||
else evflag = 0;
|
else evflag = 0;
|
||||||
@ -83,14 +83,14 @@ void AngleCharmmIntel::compute(int eflag, int vflag,
|
|||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (vflag && !eflag) {
|
if (vflag && !eflag) {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<0,1,1>(vflag, buffers, fc);
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<0,1,0>(vflag, buffers, fc);
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -103,9 +103,9 @@ void AngleCharmmIntel::compute(int eflag, int vflag,
|
|||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void AngleCharmmIntel::eval(const int vflag,
|
void AngleCharmmIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
|
|
||||||
{
|
{
|
||||||
const int inum = neighbor->nanglelist;
|
const int inum = neighbor->nanglelist;
|
||||||
@ -133,7 +133,7 @@ void AngleCharmmIntel::eval(const int vflag,
|
|||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel default(none) \
|
||||||
shared(f_start,f_stride,fc) \
|
shared(f_start,f_stride,fc) \
|
||||||
reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
|
reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
@ -148,7 +148,7 @@ void AngleCharmmIntel::eval(const int vflag,
|
|||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
const int4_t * _noalias const anglelist =
|
const int4_t * _noalias const anglelist =
|
||||||
(int4_t *) neighbor->anglelist[0];
|
(int4_t *) neighbor->anglelist[0];
|
||||||
|
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
@ -246,35 +246,35 @@ void AngleCharmmIntel::eval(const int vflag,
|
|||||||
{
|
{
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
f[i1].x += f1x;
|
f[i1].x += f1x;
|
||||||
f[i1].y += f1y;
|
f[i1].y += f1y;
|
||||||
f[i1].z += f1z;
|
f[i1].z += f1z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i2 < nlocal) {
|
if (NEWTON_BOND || i2 < nlocal) {
|
||||||
f[i2].x -= f1x + f3x;
|
f[i2].x -= f1x + f3x;
|
||||||
f[i2].y -= f1y + f3y;
|
f[i2].y -= f1y + f3y;
|
||||||
f[i2].z -= f1z + f3z;
|
f[i2].z -= f1z + f3z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i3 < nlocal) {
|
if (NEWTON_BOND || i3 < nlocal) {
|
||||||
f[i3].x += f3x;
|
f[i3].x += f3x;
|
||||||
f[i3].y += f3y;
|
f[i3].y += f3y;
|
||||||
f[i3].z += f3z;
|
f[i3].z += f3z;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EFLAG || VFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
|
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
|
||||||
i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
|
i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
|
||||||
dely1, delz1, delx2, dely2, delz2, seangle,
|
dely1, delz1, delx2, dely2, delz2, seangle,
|
||||||
f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
|
f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
|
||||||
sv4, sv5);
|
sv4, sv5);
|
||||||
#else
|
#else
|
||||||
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
|
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
|
||||||
i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
|
i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
|
||||||
dely1, delz1, delx2, dely2, delz2, oeangle,
|
dely1, delz1, delx2, dely2, delz2, oeangle,
|
||||||
f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
|
f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
|
||||||
ov4, ov5);
|
ov4, ov5);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -282,8 +282,8 @@ void AngleCharmmIntel::eval(const int vflag,
|
|||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
if (EFLAG) oeangle += seangle;
|
if (EFLAG) oeangle += seangle;
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // omp parallel
|
} // omp parallel
|
||||||
@ -291,7 +291,7 @@ void AngleCharmmIntel::eval(const int vflag,
|
|||||||
if (EFLAG) energy += oeangle;
|
if (EFLAG) energy += oeangle;
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||||
}
|
}
|
||||||
|
|
||||||
fix->set_reduce_flag();
|
fix->set_reduce_flag();
|
||||||
@ -348,11 +348,11 @@ void AngleCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void AngleCharmmIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes,
|
void AngleCharmmIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes,
|
||||||
Memory *memory) {
|
Memory *memory) {
|
||||||
if (nangletypes != _nangletypes) {
|
if (nangletypes != _nangletypes) {
|
||||||
if (_nangletypes > 0)
|
if (_nangletypes > 0)
|
||||||
_memory->destroy(fc);
|
_memory->destroy(fc);
|
||||||
|
|
||||||
if (nangletypes > 0)
|
if (nangletypes > 0)
|
||||||
_memory->create(fc,nangletypes,"anglecharmmintel.fc");
|
_memory->create(fc,nangletypes,"anglecharmmintel.fc");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -45,8 +45,8 @@ class AngleCharmmIntel : public AngleCharmm {
|
|||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t, acc_t> *buffers);
|
IntelBuffers<flt_t, acc_t> *buffers);
|
||||||
|
|||||||
@ -37,7 +37,7 @@ typedef struct { int a,b,c,t; } int4_t;
|
|||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
AngleHarmonicIntel::AngleHarmonicIntel(LAMMPS *lmp) : AngleHarmonic(lmp)
|
AngleHarmonicIntel::AngleHarmonicIntel(LAMMPS *lmp) : AngleHarmonic(lmp)
|
||||||
{
|
{
|
||||||
suffix_flag |= Suffix::INTEL;
|
suffix_flag |= Suffix::INTEL;
|
||||||
}
|
}
|
||||||
@ -74,8 +74,8 @@ void AngleHarmonicIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void AngleHarmonicIntel::compute(int eflag, int vflag,
|
void AngleHarmonicIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) ev_setup(eflag,vflag);
|
if (eflag || vflag) ev_setup(eflag,vflag);
|
||||||
else evflag = 0;
|
else evflag = 0;
|
||||||
@ -83,14 +83,14 @@ void AngleHarmonicIntel::compute(int eflag, int vflag,
|
|||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (vflag && !eflag) {
|
if (vflag && !eflag) {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<0,1,1>(vflag, buffers, fc);
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<0,1,0>(vflag, buffers, fc);
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -103,9 +103,9 @@ void AngleHarmonicIntel::compute(int eflag, int vflag,
|
|||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void AngleHarmonicIntel::eval(const int vflag,
|
void AngleHarmonicIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
|
|
||||||
{
|
{
|
||||||
const int inum = neighbor->nanglelist;
|
const int inum = neighbor->nanglelist;
|
||||||
@ -133,7 +133,7 @@ void AngleHarmonicIntel::eval(const int vflag,
|
|||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel default(none) \
|
||||||
shared(f_start,f_stride,fc) \
|
shared(f_start,f_stride,fc) \
|
||||||
reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
|
reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
@ -148,7 +148,7 @@ void AngleHarmonicIntel::eval(const int vflag,
|
|||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
const int4_t * _noalias const anglelist =
|
const int4_t * _noalias const anglelist =
|
||||||
(int4_t *) neighbor->anglelist[0];
|
(int4_t *) neighbor->anglelist[0];
|
||||||
|
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
@ -228,35 +228,35 @@ void AngleHarmonicIntel::eval(const int vflag,
|
|||||||
{
|
{
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
f[i1].x += f1x;
|
f[i1].x += f1x;
|
||||||
f[i1].y += f1y;
|
f[i1].y += f1y;
|
||||||
f[i1].z += f1z;
|
f[i1].z += f1z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i2 < nlocal) {
|
if (NEWTON_BOND || i2 < nlocal) {
|
||||||
f[i2].x -= f1x + f3x;
|
f[i2].x -= f1x + f3x;
|
||||||
f[i2].y -= f1y + f3y;
|
f[i2].y -= f1y + f3y;
|
||||||
f[i2].z -= f1z + f3z;
|
f[i2].z -= f1z + f3z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i3 < nlocal) {
|
if (NEWTON_BOND || i3 < nlocal) {
|
||||||
f[i3].x += f3x;
|
f[i3].x += f3x;
|
||||||
f[i3].y += f3y;
|
f[i3].y += f3y;
|
||||||
f[i3].z += f3z;
|
f[i3].z += f3z;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EFLAG || VFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
|
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
|
||||||
f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
|
f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
|
||||||
delz1, delx2, dely2, delz2, seangle, f,
|
delz1, delx2, dely2, delz2, seangle, f,
|
||||||
NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4,
|
NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4,
|
||||||
sv5);
|
sv5);
|
||||||
#else
|
#else
|
||||||
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
|
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
|
||||||
f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
|
f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
|
||||||
delz1, delx2, dely2, delz2, oeangle, f,
|
delz1, delx2, dely2, delz2, oeangle, f,
|
||||||
NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4,
|
NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4,
|
||||||
ov5);
|
ov5);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -264,8 +264,8 @@ void AngleHarmonicIntel::eval(const int vflag,
|
|||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
if (EFLAG) oeangle += seangle;
|
if (EFLAG) oeangle += seangle;
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // omp parallel
|
} // omp parallel
|
||||||
@ -273,7 +273,7 @@ void AngleHarmonicIntel::eval(const int vflag,
|
|||||||
if (EFLAG) energy += oeangle;
|
if (EFLAG) energy += oeangle;
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||||
}
|
}
|
||||||
|
|
||||||
fix->set_reduce_flag();
|
fix->set_reduce_flag();
|
||||||
@ -328,11 +328,11 @@ void AngleHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void AngleHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes,
|
void AngleHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes,
|
||||||
Memory *memory) {
|
Memory *memory) {
|
||||||
if (nangletypes != _nangletypes) {
|
if (nangletypes != _nangletypes) {
|
||||||
if (_nangletypes > 0)
|
if (_nangletypes > 0)
|
||||||
_memory->destroy(fc);
|
_memory->destroy(fc);
|
||||||
|
|
||||||
if (nangletypes > 0)
|
if (nangletypes > 0)
|
||||||
_memory->create(fc,nangletypes,"anglecharmmintel.fc");
|
_memory->create(fc,nangletypes,"anglecharmmintel.fc");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -45,8 +45,8 @@ class AngleHarmonicIntel : public AngleHarmonic {
|
|||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t, acc_t> *buffers);
|
IntelBuffers<flt_t, acc_t> *buffers);
|
||||||
|
|||||||
@ -33,7 +33,7 @@ typedef struct { int a,b,t; } int3_t;
|
|||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
BondFENEIntel::BondFENEIntel(LAMMPS *lmp) : BondFENE(lmp)
|
BondFENEIntel::BondFENEIntel(LAMMPS *lmp) : BondFENE(lmp)
|
||||||
{
|
{
|
||||||
suffix_flag |= Suffix::INTEL;
|
suffix_flag |= Suffix::INTEL;
|
||||||
}
|
}
|
||||||
@ -70,8 +70,8 @@ void BondFENEIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void BondFENEIntel::compute(int eflag, int vflag,
|
void BondFENEIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) ev_setup(eflag,vflag);
|
if (eflag || vflag) ev_setup(eflag,vflag);
|
||||||
else evflag = 0;
|
else evflag = 0;
|
||||||
@ -79,14 +79,14 @@ void BondFENEIntel::compute(int eflag, int vflag,
|
|||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (vflag && !eflag) {
|
if (vflag && !eflag) {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<0,1,1>(vflag, buffers, fc);
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<0,1,0>(vflag, buffers, fc);
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -97,9 +97,9 @@ void BondFENEIntel::compute(int eflag, int vflag,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void BondFENEIntel::eval(const int vflag,
|
void BondFENEIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
const int inum = neighbor->nbondlist;
|
const int inum = neighbor->nbondlist;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@ -126,7 +126,7 @@ void BondFENEIntel::eval(const int vflag,
|
|||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel default(none) \
|
||||||
shared(f_start,f_stride,fc) \
|
shared(f_start,f_stride,fc) \
|
||||||
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
|
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
@ -141,7 +141,7 @@ void BondFENEIntel::eval(const int vflag,
|
|||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
const int3_t * _noalias const bondlist =
|
const int3_t * _noalias const bondlist =
|
||||||
(int3_t *) neighbor->bondlist[0];
|
(int3_t *) neighbor->bondlist[0];
|
||||||
|
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
@ -176,7 +176,7 @@ void BondFENEIntel::eval(const int vflag,
|
|||||||
// if r -> r0, then rlogarg < 0.0 which is an error
|
// if r -> r0, then rlogarg < 0.0 which is an error
|
||||||
// issue a warning and reset rlogarg = epsilon
|
// issue a warning and reset rlogarg = epsilon
|
||||||
// if r > 2*r0 something serious is wrong, abort
|
// if r > 2*r0 something serious is wrong, abort
|
||||||
|
|
||||||
if (rlogarg < (flt_t)0.1) {
|
if (rlogarg < (flt_t)0.1) {
|
||||||
char str[128];
|
char str[128];
|
||||||
sprintf(str,"FENE bond too long: " BIGINT_FORMAT " "
|
sprintf(str,"FENE bond too long: " BIGINT_FORMAT " "
|
||||||
@ -186,18 +186,18 @@ void BondFENEIntel::eval(const int vflag,
|
|||||||
if (rlogarg <= (flt_t)-3.0) error->one(FLERR,"Bad FENE bond");
|
if (rlogarg <= (flt_t)-3.0) error->one(FLERR,"Bad FENE bond");
|
||||||
rlogarg = (flt_t)0.1;
|
rlogarg = (flt_t)0.1;
|
||||||
}
|
}
|
||||||
|
|
||||||
flt_t fbond = -k/rlogarg;
|
flt_t fbond = -k/rlogarg;
|
||||||
|
|
||||||
// force from LJ term
|
// force from LJ term
|
||||||
|
|
||||||
flt_t sr2,sr6;
|
flt_t sr2,sr6;
|
||||||
if (rsq < (flt_t)TWO_1_3*sigmasq) {
|
if (rsq < (flt_t)TWO_1_3*sigmasq) {
|
||||||
sr2 = sigmasq * irsq;
|
sr2 = sigmasq * irsq;
|
||||||
sr6 = sr2 * sr2 * sr2;
|
sr6 = sr2 * sr2 * sr2;
|
||||||
fbond += (flt_t)48.0 * epsilon * sr6 * (sr6 - (flt_t)0.5) * irsq;
|
fbond += (flt_t)48.0 * epsilon * sr6 * (sr6 - (flt_t)0.5) * irsq;
|
||||||
}
|
}
|
||||||
|
|
||||||
// energy
|
// energy
|
||||||
|
|
||||||
flt_t ebond;
|
flt_t ebond;
|
||||||
@ -215,27 +215,27 @@ void BondFENEIntel::eval(const int vflag,
|
|||||||
{
|
{
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
f[i1].x += delx*fbond;
|
f[i1].x += delx*fbond;
|
||||||
f[i1].y += dely*fbond;
|
f[i1].y += dely*fbond;
|
||||||
f[i1].z += delz*fbond;
|
f[i1].z += delz*fbond;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i2 < nlocal) {
|
if (NEWTON_BOND || i2 < nlocal) {
|
||||||
f[i2].x -= delx*fbond;
|
f[i2].x -= delx*fbond;
|
||||||
f[i2].y -= dely*fbond;
|
f[i2].y -= dely*fbond;
|
||||||
f[i2].z -= delz*fbond;
|
f[i2].z -= delz*fbond;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EFLAG || VFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
||||||
delx, dely, delz, sebond, f, NEWTON_BOND,
|
delx, dely, delz, sebond, f, NEWTON_BOND,
|
||||||
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
||||||
#else
|
#else
|
||||||
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
||||||
delx, dely, delz, oebond, f, NEWTON_BOND,
|
delx, dely, delz, oebond, f, NEWTON_BOND,
|
||||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
} // for n
|
} // for n
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
@ -250,7 +250,7 @@ void BondFENEIntel::eval(const int vflag,
|
|||||||
if (EFLAG) energy += oebond;
|
if (EFLAG) energy += oebond;
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||||
}
|
}
|
||||||
|
|
||||||
fix->set_reduce_flag();
|
fix->set_reduce_flag();
|
||||||
@ -307,11 +307,11 @@ void BondFENEIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void BondFENEIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
|
void BondFENEIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
|
||||||
Memory *memory) {
|
Memory *memory) {
|
||||||
if (nbondtypes != _nbondtypes) {
|
if (nbondtypes != _nbondtypes) {
|
||||||
if (_nbondtypes > 0)
|
if (_nbondtypes > 0)
|
||||||
_memory->destroy(fc);
|
_memory->destroy(fc);
|
||||||
|
|
||||||
if (nbondtypes > 0)
|
if (nbondtypes > 0)
|
||||||
_memory->create(fc,nbondtypes,"bondfeneintel.fc");
|
_memory->create(fc,nbondtypes,"bondfeneintel.fc");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -45,8 +45,8 @@ class BondFENEIntel : public BondFENE {
|
|||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t, acc_t> *buffers);
|
IntelBuffers<flt_t, acc_t> *buffers);
|
||||||
|
|||||||
@ -33,7 +33,7 @@ typedef struct { int a,b,t; } int3_t;
|
|||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp)
|
BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp)
|
||||||
{
|
{
|
||||||
suffix_flag |= Suffix::INTEL;
|
suffix_flag |= Suffix::INTEL;
|
||||||
}
|
}
|
||||||
@ -70,8 +70,8 @@ void BondHarmonicIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void BondHarmonicIntel::compute(int eflag, int vflag,
|
void BondHarmonicIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) ev_setup(eflag,vflag);
|
if (eflag || vflag) ev_setup(eflag,vflag);
|
||||||
else evflag = 0;
|
else evflag = 0;
|
||||||
@ -79,14 +79,14 @@ void BondHarmonicIntel::compute(int eflag, int vflag,
|
|||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (vflag && !eflag) {
|
if (vflag && !eflag) {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<0,1,1>(vflag, buffers, fc);
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<0,1,0>(vflag, buffers, fc);
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -97,9 +97,9 @@ void BondHarmonicIntel::compute(int eflag, int vflag,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void BondHarmonicIntel::eval(const int vflag,
|
void BondHarmonicIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
const int inum = neighbor->nbondlist;
|
const int inum = neighbor->nbondlist;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@ -126,7 +126,7 @@ void BondHarmonicIntel::eval(const int vflag,
|
|||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel default(none) \
|
||||||
shared(f_start,f_stride,fc) \
|
shared(f_start,f_stride,fc) \
|
||||||
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
|
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
@ -141,7 +141,7 @@ void BondHarmonicIntel::eval(const int vflag,
|
|||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
const int3_t * _noalias const bondlist =
|
const int3_t * _noalias const bondlist =
|
||||||
(int3_t *) neighbor->bondlist[0];
|
(int3_t *) neighbor->bondlist[0];
|
||||||
|
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
@ -184,29 +184,29 @@ void BondHarmonicIntel::eval(const int vflag,
|
|||||||
{
|
{
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
f[i1].x += delx*fbond;
|
f[i1].x += delx*fbond;
|
||||||
f[i1].y += dely*fbond;
|
f[i1].y += dely*fbond;
|
||||||
f[i1].z += delz*fbond;
|
f[i1].z += delz*fbond;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i2 < nlocal) {
|
if (NEWTON_BOND || i2 < nlocal) {
|
||||||
f[i2].x -= delx*fbond;
|
f[i2].x -= delx*fbond;
|
||||||
f[i2].y -= dely*fbond;
|
f[i2].y -= dely*fbond;
|
||||||
f[i2].z -= delz*fbond;
|
f[i2].z -= delz*fbond;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EFLAG || VFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
|
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
|
||||||
fbond, delx, dely, delz, sebond, f,
|
fbond, delx, dely, delz, sebond, f,
|
||||||
NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
|
NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
|
||||||
sv4, sv5);
|
sv4, sv5);
|
||||||
#else
|
#else
|
||||||
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
|
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
|
||||||
fbond, delx, dely, delz, oebond, f,
|
fbond, delx, dely, delz, oebond, f,
|
||||||
NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
|
NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
|
||||||
ov4, ov5);
|
ov4, ov5);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
} // for n
|
} // for n
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
@ -221,7 +221,7 @@ void BondHarmonicIntel::eval(const int vflag,
|
|||||||
if (EFLAG) energy += oebond;
|
if (EFLAG) energy += oebond;
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||||
}
|
}
|
||||||
|
|
||||||
fix->set_reduce_flag();
|
fix->set_reduce_flag();
|
||||||
@ -276,11 +276,11 @@ void BondHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void BondHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
|
void BondHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
|
||||||
Memory *memory) {
|
Memory *memory) {
|
||||||
if (nbondtypes != _nbondtypes) {
|
if (nbondtypes != _nbondtypes) {
|
||||||
if (_nbondtypes > 0)
|
if (_nbondtypes > 0)
|
||||||
_memory->destroy(fc);
|
_memory->destroy(fc);
|
||||||
|
|
||||||
if (nbondtypes > 0)
|
if (nbondtypes > 0)
|
||||||
_memory->create(fc,nbondtypes,"bondharmonicintel.fc");
|
_memory->create(fc,nbondtypes,"bondharmonicintel.fc");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -45,8 +45,8 @@ class BondHarmonicIntel : public BondHarmonic {
|
|||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t, acc_t> *buffers);
|
IntelBuffers<flt_t, acc_t> *buffers);
|
||||||
|
|||||||
@ -80,8 +80,8 @@ void DihedralCharmmIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void DihedralCharmmIntel::compute(int eflag, int vflag,
|
void DihedralCharmmIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) {
|
if (eflag || vflag) {
|
||||||
ev_setup(eflag,vflag);
|
ev_setup(eflag,vflag);
|
||||||
@ -95,14 +95,14 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
|
|||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (vflag && !eflag) {
|
if (vflag && !eflag) {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<0,1,1>(vflag, buffers, fc);
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<0,1,0>(vflag, buffers, fc);
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -115,9 +115,9 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
|
|||||||
#ifndef LMP_USE_AVXCD_DHC
|
#ifndef LMP_USE_AVXCD_DHC
|
||||||
|
|
||||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void DihedralCharmmIntel::eval(const int vflag,
|
void DihedralCharmmIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
|
|
||||||
{
|
{
|
||||||
const int inum = neighbor->ndihedrallist;
|
const int inum = neighbor->ndihedrallist;
|
||||||
@ -148,9 +148,9 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel default(none) \
|
||||||
shared(f_start,f_stride,fc) \
|
shared(f_start,f_stride,fc) \
|
||||||
reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
|
reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
|
||||||
opv0,opv1,opv2,opv3,opv4,opv5)
|
opv0,opv1,opv2,opv3,opv4,opv5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
#if defined(LMP_SIMD_COMPILER_TEST)
|
#if defined(LMP_SIMD_COMPILER_TEST)
|
||||||
@ -165,7 +165,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
const int5_t * _noalias const dihedrallist =
|
const int5_t * _noalias const dihedrallist =
|
||||||
(int5_t *) neighbor->dihedrallist[0];
|
(int5_t *) neighbor->dihedrallist[0];
|
||||||
const flt_t qqrd2e = force->qqrd2e;
|
const flt_t qqrd2e = force->qqrd2e;
|
||||||
|
|
||||||
@ -180,7 +180,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
#if defined(LMP_SIMD_COMPILER_TEST)
|
#if defined(LMP_SIMD_COMPILER_TEST)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
|
#pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
|
||||||
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
|
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
|
||||||
for (int n = nfrom; n < nto; n++) {
|
for (int n = nfrom; n < nto; n++) {
|
||||||
#endif
|
#endif
|
||||||
for (int n = nfrom; n < nto; n += npl) {
|
for (int n = nfrom; n < nto; n += npl) {
|
||||||
@ -204,7 +204,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
const flt_t vb2zm = x[i2].z - x[i3].z;
|
const flt_t vb2zm = x[i2].z - x[i3].z;
|
||||||
|
|
||||||
// 3rd bond
|
// 3rd bond
|
||||||
|
|
||||||
const flt_t vb3x = x[i4].x - x[i3].x;
|
const flt_t vb3x = x[i4].x - x[i3].x;
|
||||||
const flt_t vb3y = x[i4].y - x[i3].y;
|
const flt_t vb3y = x[i4].y - x[i3].y;
|
||||||
const flt_t vb3z = x[i4].z - x[i3].z;
|
const flt_t vb3z = x[i4].z - x[i3].z;
|
||||||
@ -244,25 +244,25 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
// error check
|
// error check
|
||||||
#ifndef LMP_SIMD_COMPILER_TEST
|
#ifndef LMP_SIMD_COMPILER_TEST
|
||||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||||
int me = comm->me;
|
int me = comm->me;
|
||||||
|
|
||||||
if (screen) {
|
if (screen) {
|
||||||
char str[128];
|
char str[128];
|
||||||
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
|
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
|
||||||
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
||||||
TAGINT_FORMAT " " TAGINT_FORMAT,
|
TAGINT_FORMAT " " TAGINT_FORMAT,
|
||||||
me,tid,update->ntimestep,
|
me,tid,update->ntimestep,
|
||||||
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
||||||
error->warning(FLERR,str,0);
|
error->warning(FLERR,str,0);
|
||||||
fprintf(screen," 1st atom: %d %g %g %g\n",
|
fprintf(screen," 1st atom: %d %g %g %g\n",
|
||||||
me,x[i1].x,x[i1].y,x[i1].z);
|
me,x[i1].x,x[i1].y,x[i1].z);
|
||||||
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
||||||
me,x[i2].x,x[i2].y,x[i2].z);
|
me,x[i2].x,x[i2].y,x[i2].z);
|
||||||
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
||||||
me,x[i3].x,x[i3].y,x[i3].z);
|
me,x[i3].x,x[i3].y,x[i3].z);
|
||||||
fprintf(screen," 4th atom: %d %g %g %g\n",
|
fprintf(screen," 4th atom: %d %g %g %g\n",
|
||||||
me,x[i4].x,x[i4].y,x[i4].z);
|
me,x[i4].x,x[i4].y,x[i4].z);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -279,19 +279,19 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
ddf1 = df1 = (flt_t)0.0;
|
ddf1 = df1 = (flt_t)0.0;
|
||||||
|
|
||||||
for (int i = 0; i < m; i++) {
|
for (int i = 0; i < m; i++) {
|
||||||
ddf1 = p*c - df1*s;
|
ddf1 = p*c - df1*s;
|
||||||
df1 = p*s + df1*c;
|
df1 = p*s + df1*c;
|
||||||
p = ddf1;
|
p = ddf1;
|
||||||
}
|
}
|
||||||
|
|
||||||
p = p*tcos_shift + df1*tsin_shift;
|
p = p*tcos_shift + df1*tsin_shift;
|
||||||
df1 = df1*tcos_shift - ddf1*tsin_shift;
|
df1 = df1*tcos_shift - ddf1*tsin_shift;
|
||||||
df1 *= -m;
|
df1 *= -m;
|
||||||
p += (flt_t)1.0;
|
p += (flt_t)1.0;
|
||||||
|
|
||||||
if (m == 0) {
|
if (m == 0) {
|
||||||
p = (flt_t)1.0 + tcos_shift;
|
p = (flt_t)1.0 + tcos_shift;
|
||||||
df1 = (flt_t)0.0;
|
df1 = (flt_t)0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
|
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
|
||||||
@ -334,12 +334,12 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
const flt_t f3z = -sz2 - f4z;
|
const flt_t f3z = -sz2 - f4z;
|
||||||
|
|
||||||
if (EFLAG || VFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
flt_t deng;
|
flt_t deng;
|
||||||
if (EFLAG) deng = tk * p;
|
if (EFLAG) deng = tk * p;
|
||||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3,
|
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3,
|
||||||
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
|
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
|
||||||
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
|
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
|
||||||
vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND,
|
vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND,
|
||||||
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -349,15 +349,15 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
if (NEWTON_BOND || i2 < nlocal) {
|
if (NEWTON_BOND || i2 < nlocal) {
|
||||||
f[i2].x += f2x;
|
f[i2].x += f2x;
|
||||||
f[i2].y += f2y;
|
f[i2].y += f2y;
|
||||||
f[i2].z += f2z;
|
f[i2].z += f2z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i3 < nlocal) {
|
if (NEWTON_BOND || i3 < nlocal) {
|
||||||
f[i3].x += f3x;
|
f[i3].x += f3x;
|
||||||
f[i3].y += f3y;
|
f[i3].y += f3y;
|
||||||
f[i3].z += f3z;
|
f[i3].z += f3z;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -372,54 +372,54 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
flt_t forcecoul;
|
flt_t forcecoul;
|
||||||
if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv;
|
if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv;
|
||||||
else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv);
|
else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv);
|
||||||
const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv -
|
const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv -
|
||||||
fc.ljp[itype][jtype].lj2);
|
fc.ljp[itype][jtype].lj2);
|
||||||
const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv;
|
const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv;
|
||||||
|
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
f1x += delx*fpair;
|
f1x += delx*fpair;
|
||||||
f1y += dely*fpair;
|
f1y += dely*fpair;
|
||||||
f1z += delz*fpair;
|
f1z += delz*fpair;
|
||||||
}
|
}
|
||||||
if (NEWTON_BOND || i4 < nlocal) {
|
if (NEWTON_BOND || i4 < nlocal) {
|
||||||
f4x -= delx*fpair;
|
f4x -= delx*fpair;
|
||||||
f4y -= dely*fpair;
|
f4y -= dely*fpair;
|
||||||
f4z -= delz*fpair;
|
f4z -= delz*fpair;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EFLAG || VFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
flt_t ev_pre = (flt_t)0;
|
flt_t ev_pre = (flt_t)0;
|
||||||
if (NEWTON_BOND || i1 < nlocal)
|
if (NEWTON_BOND || i1 < nlocal)
|
||||||
ev_pre += (flt_t)0.5;
|
ev_pre += (flt_t)0.5;
|
||||||
if (NEWTON_BOND || i4 < nlocal)
|
if (NEWTON_BOND || i4 < nlocal)
|
||||||
ev_pre += (flt_t)0.5;
|
ev_pre += (flt_t)0.5;
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
flt_t ecoul, evdwl;
|
flt_t ecoul, evdwl;
|
||||||
ecoul = tweight * forcecoul;
|
ecoul = tweight * forcecoul;
|
||||||
evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv -
|
evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv -
|
||||||
fc.ljp[itype][jtype].lj4);
|
fc.ljp[itype][jtype].lj4);
|
||||||
secoul += ev_pre * ecoul;
|
secoul += ev_pre * ecoul;
|
||||||
sevdwl += ev_pre * evdwl;
|
sevdwl += ev_pre * evdwl;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
evdwl *= (flt_t)0.5;
|
evdwl *= (flt_t)0.5;
|
||||||
evdwl += (flt_t)0.5 * ecoul;
|
evdwl += (flt_t)0.5 * ecoul;
|
||||||
if (NEWTON_BOND || i1 < nlocal)
|
if (NEWTON_BOND || i1 < nlocal)
|
||||||
f[i1].w += evdwl;
|
f[i1].w += evdwl;
|
||||||
if (NEWTON_BOND || i4 < nlocal)
|
if (NEWTON_BOND || i4 < nlocal)
|
||||||
f[i4].w += evdwl;
|
f[i4].w += evdwl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
|
// IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
|
||||||
// delx, dely, delz);
|
// delx, dely, delz);
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
spv0 += ev_pre * delx * delx * fpair;
|
spv0 += ev_pre * delx * delx * fpair;
|
||||||
spv1 += ev_pre * dely * dely * fpair;
|
spv1 += ev_pre * dely * dely * fpair;
|
||||||
spv2 += ev_pre * delz * delz * fpair;
|
spv2 += ev_pre * delz * delz * fpair;
|
||||||
spv3 += ev_pre * delx * dely * fpair;
|
spv3 += ev_pre * delx * dely * fpair;
|
||||||
spv4 += ev_pre * delx * delz * fpair;
|
spv4 += ev_pre * delx * delz * fpair;
|
||||||
spv5 += ev_pre * dely * delz * fpair;
|
spv5 += ev_pre * dely * delz * fpair;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// apply force to each of 4 atoms
|
// apply force to each of 4 atoms
|
||||||
@ -428,15 +428,15 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
f[i1].x += f1x;
|
f[i1].x += f1x;
|
||||||
f[i1].y += f1y;
|
f[i1].y += f1y;
|
||||||
f[i1].z += f1z;
|
f[i1].z += f1z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i4 < nlocal) {
|
if (NEWTON_BOND || i4 < nlocal) {
|
||||||
f[i4].x += f4x;
|
f[i4].x += f4x;
|
||||||
f[i4].y += f4y;
|
f[i4].y += f4y;
|
||||||
f[i4].z += f4z;
|
f[i4].z += f4z;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} // for n
|
} // for n
|
||||||
@ -447,7 +447,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
}
|
}
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||||
opv0 += spv0; opv1 += spv1; opv2 += spv2;
|
opv0 += spv0; opv1 += spv1; opv2 += spv2;
|
||||||
opv3 += spv3; opv4 += spv4; opv5 += spv5;
|
opv3 += spv3; opv4 += spv4; opv5 += spv5;
|
||||||
}
|
}
|
||||||
} // omp parallel
|
} // omp parallel
|
||||||
@ -485,9 +485,9 @@ authors for more details.
|
|||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void DihedralCharmmIntel::eval(const int vflag,
|
void DihedralCharmmIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
|
|
||||||
{
|
{
|
||||||
typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
|
typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
|
||||||
@ -522,20 +522,20 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel default(none) \
|
||||||
shared(f_start,f_stride,fc) \
|
shared(f_start,f_stride,fc) \
|
||||||
reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
|
reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
|
||||||
opv0,opv1,opv2,opv3,opv4,opv5)
|
opv0,opv1,opv2,opv3,opv4,opv5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int nfrom, npl, nto, tid;
|
int nfrom, npl, nto, tid;
|
||||||
IP_PRE_omp_stride_id_vec(nfrom, npl, nto, tid, inum, nthreads,
|
IP_PRE_omp_stride_id_vec(nfrom, npl, nto, tid, inum, nthreads,
|
||||||
swidth);
|
swidth);
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
const int * _noalias const dihedrallist =
|
const int * _noalias const dihedrallist =
|
||||||
(int *) neighbor->dihedrallist[0];
|
(int *) neighbor->dihedrallist[0];
|
||||||
const flt_t * _noalias const weight = &(fc.weight[0]);
|
const flt_t * _noalias const weight = &(fc.weight[0]);
|
||||||
const flt_t * _noalias const x_f = &(x[0].x);
|
const flt_t * _noalias const x_f = &(x[0].x);
|
||||||
@ -574,7 +574,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
}
|
}
|
||||||
|
|
||||||
SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
|
SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
|
||||||
55, 60, 65, 70, 75) + (nfrom * 5);
|
55, 60, 65, 70, 75) + (nfrom * 5);
|
||||||
const int nto5 = nto * 5;
|
const int nto5 = nto * 5;
|
||||||
const int nlocals4 = nlocal << 4;
|
const int nlocals4 = nlocal << 4;
|
||||||
const SIMD_int simd_nlocals4 = SIMD_set(nlocals4);
|
const SIMD_int simd_nlocals4 = SIMD_set(nlocals4);
|
||||||
@ -618,7 +618,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
const SIMD_flt_t vb2zm = z2 - z3;
|
const SIMD_flt_t vb2zm = z2 - z3;
|
||||||
|
|
||||||
// 3rd bond
|
// 3rd bond
|
||||||
|
|
||||||
SIMD_flt_t x4, y4, z4;
|
SIMD_flt_t x4, y4, z4;
|
||||||
SIMD_int jtype;
|
SIMD_int jtype;
|
||||||
|
|
||||||
@ -664,7 +664,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
const SIMD_flt_t ptol = SIMD_set(PTOLERANCE);
|
const SIMD_flt_t ptol = SIMD_set(PTOLERANCE);
|
||||||
const SIMD_flt_t ntol = SIMD_set(MTOLERANCE);
|
const SIMD_flt_t ntol = SIMD_set(MTOLERANCE);
|
||||||
if (c > ptol || c < ntol)
|
if (c > ptol || c < ntol)
|
||||||
if (screen)
|
if (screen)
|
||||||
error->warning(FLERR,"Dihedral problem.");
|
error->warning(FLERR,"Dihedral problem.");
|
||||||
|
|
||||||
c = SIMD_set(c, c > one, one);
|
c = SIMD_set(c, c > one, one);
|
||||||
@ -678,14 +678,14 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
SIMD_flt_t p(one);
|
SIMD_flt_t p(one);
|
||||||
SIMD_flt_t ddf1(szero);
|
SIMD_flt_t ddf1(szero);
|
||||||
SIMD_flt_t df1(szero);
|
SIMD_flt_t df1(szero);
|
||||||
|
|
||||||
const int m_max = SIMD_max(m);
|
const int m_max = SIMD_max(m);
|
||||||
|
|
||||||
for (int i = 0; i < m_max; i++) {
|
for (int i = 0; i < m_max; i++) {
|
||||||
const SIMD_mask my_m = i < m;
|
const SIMD_mask my_m = i < m;
|
||||||
ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s);
|
ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s);
|
||||||
df1 = SIMD_set(df1, my_m, p*s + df1*c);
|
df1 = SIMD_set(df1, my_m, p*s + df1*c);
|
||||||
p = SIMD_set(p, my_m, ddf1);
|
p = SIMD_set(p, my_m, ddf1);
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMD_flt_t multf;
|
SIMD_flt_t multf;
|
||||||
@ -694,7 +694,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
df1 = df1*tcos_shift - ddf1*tsin_shift;
|
df1 = df1*tcos_shift - ddf1*tsin_shift;
|
||||||
df1 = df1 * multf;
|
df1 = df1 * multf;
|
||||||
p = p + one;
|
p = p + one;
|
||||||
|
|
||||||
SIMD_mask mzero = (m == SIMD_set((int)0));
|
SIMD_mask mzero = (m == SIMD_set((int)0));
|
||||||
p = SIMD_set(p, mzero, one + tcos_shift);
|
p = SIMD_set(p, mzero, one + tcos_shift);
|
||||||
df1 = SIMD_set(df1, mzero, szero);
|
df1 = SIMD_set(df1, mzero, szero);
|
||||||
@ -740,40 +740,40 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
|
|
||||||
SIMD_flt_t qdeng;
|
SIMD_flt_t qdeng;
|
||||||
if (EFLAG || VFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
SIMD_flt_t ev_pre;
|
SIMD_flt_t ev_pre;
|
||||||
if (NEWTON_BOND) ev_pre = one;
|
if (NEWTON_BOND) ev_pre = one;
|
||||||
else {
|
else {
|
||||||
ev_pre = szero;
|
ev_pre = szero;
|
||||||
const SIMD_flt_t quarter = SIMD_set((flt_t)0.25);
|
const SIMD_flt_t quarter = SIMD_set((flt_t)0.25);
|
||||||
ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter);
|
ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter);
|
||||||
ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter);
|
ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter);
|
||||||
ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter);
|
ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter);
|
||||||
ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter);
|
ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter);
|
||||||
}
|
}
|
||||||
SIMD_zero_masked(nmask, ev_pre);
|
SIMD_zero_masked(nmask, ev_pre);
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
const SIMD_flt_t deng = tk * p;
|
const SIMD_flt_t deng = tk * p;
|
||||||
sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng);
|
sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng);
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
qdeng = deng * SIMD_set((flt_t)0.25);
|
qdeng = deng * SIMD_set((flt_t)0.25);
|
||||||
SIMD_mask newton_mask;
|
SIMD_mask newton_mask;
|
||||||
if (NEWTON_BOND) newton_mask = nmask;
|
if (NEWTON_BOND) newton_mask = nmask;
|
||||||
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
|
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
|
||||||
SIMD_flt_t ieng = qdeng;
|
SIMD_flt_t ieng = qdeng;
|
||||||
SIMD_jeng_update(newton_mask, featom, i2, ieng);
|
SIMD_jeng_update(newton_mask, featom, i2, ieng);
|
||||||
ieng = qdeng;
|
ieng = qdeng;
|
||||||
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
|
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
|
||||||
SIMD_jeng_update(newton_mask, featom, i3, ieng);
|
SIMD_jeng_update(newton_mask, featom, i3, ieng);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x));
|
sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x));
|
||||||
sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y));
|
sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y));
|
||||||
sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z));
|
sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z));
|
||||||
sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y));
|
sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y));
|
||||||
sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z));
|
sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z));
|
||||||
sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z));
|
sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMD_mask newton_mask;
|
SIMD_mask newton_mask;
|
||||||
@ -809,27 +809,27 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
f4z = f4z - delz * fpair;
|
f4z = f4z - delz * fpair;
|
||||||
|
|
||||||
if (EFLAG || VFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
SIMD_flt_t ev_pre;
|
SIMD_flt_t ev_pre;
|
||||||
if (NEWTON_BOND) ev_pre = one;
|
if (NEWTON_BOND) ev_pre = one;
|
||||||
else {
|
else {
|
||||||
ev_pre = szero;
|
ev_pre = szero;
|
||||||
const SIMD_flt_t half = SIMD_set((flt_t)0.5);
|
const SIMD_flt_t half = SIMD_set((flt_t)0.5);
|
||||||
ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4,ev_pre,half);
|
ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4,ev_pre,half);
|
||||||
ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4,ev_pre,half);
|
ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4,ev_pre,half);
|
||||||
}
|
}
|
||||||
SIMD_zero_masked(nmask, ev_pre);
|
SIMD_zero_masked(nmask, ev_pre);
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
const SIMD_flt_t ecoul = tweight * forcecoul;
|
const SIMD_flt_t ecoul = tweight * forcecoul;
|
||||||
const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype);
|
const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype);
|
||||||
const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype);
|
const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype);
|
||||||
SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4);
|
SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4);
|
||||||
secoul = SIMD_ev_add(secoul, ev_pre * ecoul);
|
secoul = SIMD_ev_add(secoul, ev_pre * ecoul);
|
||||||
sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl);
|
sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl);
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
const SIMD_flt_t half = SIMD_set((flt_t)0.5);
|
const SIMD_flt_t half = SIMD_set((flt_t)0.5);
|
||||||
evdwl = evdwl * half;
|
evdwl = evdwl * half;
|
||||||
evdwl = evdwl + half * ecoul + qdeng;
|
evdwl = evdwl + half * ecoul + qdeng;
|
||||||
|
|
||||||
if (NEWTON_BOND) newton_mask = nmask;
|
if (NEWTON_BOND) newton_mask = nmask;
|
||||||
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4);
|
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4);
|
||||||
@ -838,16 +838,16 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
ieng = evdwl;
|
ieng = evdwl;
|
||||||
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4);
|
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4);
|
||||||
SIMD_jeng_update(newton_mask, featom, i4, ieng);
|
SIMD_jeng_update(newton_mask, featom, i4, ieng);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair);
|
spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair);
|
||||||
spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair);
|
spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair);
|
||||||
spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair);
|
spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair);
|
||||||
spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair);
|
spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair);
|
||||||
spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair);
|
spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair);
|
||||||
spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair);
|
spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND) newton_mask = nmask;
|
if (NEWTON_BOND) newton_mask = nmask;
|
||||||
@ -863,17 +863,17 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
oevdwl += SIMD_sum(sevdwl);
|
oevdwl += SIMD_sum(sevdwl);
|
||||||
}
|
}
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
ov0 += SIMD_sum(sv0);
|
ov0 += SIMD_sum(sv0);
|
||||||
ov1 += SIMD_sum(sv1);
|
ov1 += SIMD_sum(sv1);
|
||||||
ov2 += SIMD_sum(sv2);
|
ov2 += SIMD_sum(sv2);
|
||||||
ov3 += SIMD_sum(sv3);
|
ov3 += SIMD_sum(sv3);
|
||||||
ov4 += SIMD_sum(sv4);
|
ov4 += SIMD_sum(sv4);
|
||||||
ov5 += SIMD_sum(sv5);
|
ov5 += SIMD_sum(sv5);
|
||||||
opv0 += SIMD_sum(spv0);
|
opv0 += SIMD_sum(spv0);
|
||||||
opv1 += SIMD_sum(spv1);
|
opv1 += SIMD_sum(spv1);
|
||||||
opv2 += SIMD_sum(spv2);
|
opv2 += SIMD_sum(spv2);
|
||||||
opv3 += SIMD_sum(spv3);
|
opv3 += SIMD_sum(spv3);
|
||||||
opv4 += SIMD_sum(spv4);
|
opv4 += SIMD_sum(spv4);
|
||||||
opv5 += SIMD_sum(spv5);
|
opv5 += SIMD_sum(spv5);
|
||||||
}
|
}
|
||||||
} // omp parallel
|
} // omp parallel
|
||||||
@ -933,7 +933,7 @@ void DihedralCharmmIntel::init_style()
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
|
void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers)
|
IntelBuffers<flt_t,acc_t> *buffers)
|
||||||
{
|
{
|
||||||
|
|
||||||
const int tp1 = atom->ntypes + 1;
|
const int tp1 = atom->ntypes + 1;
|
||||||
@ -944,10 +944,10 @@ void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
if (weightflag) {
|
if (weightflag) {
|
||||||
for (int i = 0; i < tp1; i++) {
|
for (int i = 0; i < tp1; i++) {
|
||||||
for (int j = 0; j < tp1; j++) {
|
for (int j = 0; j < tp1; j++) {
|
||||||
fc.ljp[i][j].lj1 = lj14_1[i][j];
|
fc.ljp[i][j].lj1 = lj14_1[i][j];
|
||||||
fc.ljp[i][j].lj2 = lj14_2[i][j];
|
fc.ljp[i][j].lj2 = lj14_2[i][j];
|
||||||
fc.ljp[i][j].lj3 = lj14_3[i][j];
|
fc.ljp[i][j].lj3 = lj14_3[i][j];
|
||||||
fc.ljp[i][j].lj4 = lj14_4[i][j];
|
fc.ljp[i][j].lj4 = lj14_4[i][j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -965,8 +965,8 @@ void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes,
|
void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes,
|
||||||
const int nbondtypes,
|
const int nbondtypes,
|
||||||
Memory *memory) {
|
Memory *memory) {
|
||||||
if (npairtypes != _npairtypes) {
|
if (npairtypes != _npairtypes) {
|
||||||
if (_npairtypes > 0)
|
if (_npairtypes > 0)
|
||||||
_memory->destroy(ljp);
|
_memory->destroy(ljp);
|
||||||
@ -979,7 +979,7 @@ void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes,
|
|||||||
_memory->destroy(bp);
|
_memory->destroy(bp);
|
||||||
_memory->destroy(weight);
|
_memory->destroy(weight);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nbondtypes > 0) {
|
if (nbondtypes > 0) {
|
||||||
_memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
|
_memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
|
||||||
_memory->create(weight,nbondtypes,"dihedralcharmmintel.weight");
|
_memory->create(weight,nbondtypes,"dihedralcharmmintel.weight");
|
||||||
|
|||||||
@ -44,8 +44,8 @@ class DihedralCharmmIntel : public DihedralCharmm {
|
|||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t, acc_t> *buffers);
|
IntelBuffers<flt_t, acc_t> *buffers);
|
||||||
@ -58,7 +58,7 @@ class DihedralCharmmIntel : public DihedralCharmm {
|
|||||||
class ForceConst {
|
class ForceConst {
|
||||||
public:
|
public:
|
||||||
typedef struct { flt_t lj1, lj2, lj3, lj4; } fc_packed1;
|
typedef struct { flt_t lj1, lj2, lj3, lj4; } fc_packed1;
|
||||||
typedef struct { flt_t cos_shift, sin_shift, k;
|
typedef struct { flt_t cos_shift, sin_shift, k;
|
||||||
int multiplicity; } fc_packed3;
|
int multiplicity; } fc_packed3;
|
||||||
|
|
||||||
fc_packed1 **ljp;
|
fc_packed1 **ljp;
|
||||||
|
|||||||
@ -69,8 +69,8 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void DihedralHarmonicIntel::compute(int eflag, int vflag,
|
void DihedralHarmonicIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) {
|
if (eflag || vflag) {
|
||||||
ev_setup(eflag,vflag);
|
ev_setup(eflag,vflag);
|
||||||
@ -79,14 +79,14 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag,
|
|||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (vflag && !eflag) {
|
if (vflag && !eflag) {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<0,1,1>(vflag, buffers, fc);
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<0,1,0>(vflag, buffers, fc);
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -97,9 +97,9 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void DihedralHarmonicIntel::eval(const int vflag,
|
void DihedralHarmonicIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
|
|
||||||
{
|
{
|
||||||
const int inum = neighbor->ndihedrallist;
|
const int inum = neighbor->ndihedrallist;
|
||||||
@ -127,7 +127,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
|||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel default(none) \
|
||||||
shared(f_start,f_stride,fc) \
|
shared(f_start,f_stride,fc) \
|
||||||
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
|
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
@ -142,7 +142,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
|||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
const int5_t * _noalias const dihedrallist =
|
const int5_t * _noalias const dihedrallist =
|
||||||
(int5_t *) neighbor->dihedrallist[0];
|
(int5_t *) neighbor->dihedrallist[0];
|
||||||
|
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
@ -175,7 +175,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
|||||||
const flt_t vb2zm = x[i2].z - x[i3].z;
|
const flt_t vb2zm = x[i2].z - x[i3].z;
|
||||||
|
|
||||||
// 3rd bond
|
// 3rd bond
|
||||||
|
|
||||||
const flt_t vb3x = x[i4].x - x[i3].x;
|
const flt_t vb3x = x[i4].x - x[i3].x;
|
||||||
const flt_t vb3y = x[i4].y - x[i3].y;
|
const flt_t vb3y = x[i4].y - x[i3].y;
|
||||||
const flt_t vb3z = x[i4].z - x[i3].z;
|
const flt_t vb3z = x[i4].z - x[i3].z;
|
||||||
@ -207,25 +207,25 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
|||||||
// error check
|
// error check
|
||||||
#ifndef LMP_INTEL_USE_SIMDOFF
|
#ifndef LMP_INTEL_USE_SIMDOFF
|
||||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||||
int me = comm->me;
|
int me = comm->me;
|
||||||
|
|
||||||
if (screen) {
|
if (screen) {
|
||||||
char str[128];
|
char str[128];
|
||||||
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
|
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
|
||||||
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
||||||
TAGINT_FORMAT " " TAGINT_FORMAT,
|
TAGINT_FORMAT " " TAGINT_FORMAT,
|
||||||
me,tid,update->ntimestep,
|
me,tid,update->ntimestep,
|
||||||
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
||||||
error->warning(FLERR,str,0);
|
error->warning(FLERR,str,0);
|
||||||
fprintf(screen," 1st atom: %d %g %g %g\n",
|
fprintf(screen," 1st atom: %d %g %g %g\n",
|
||||||
me,x[i1].x,x[i1].y,x[i1].z);
|
me,x[i1].x,x[i1].y,x[i1].z);
|
||||||
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
||||||
me,x[i2].x,x[i2].y,x[i2].z);
|
me,x[i2].x,x[i2].y,x[i2].z);
|
||||||
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
||||||
me,x[i3].x,x[i3].y,x[i3].z);
|
me,x[i3].x,x[i3].y,x[i3].z);
|
||||||
fprintf(screen," 4th atom: %d %g %g %g\n",
|
fprintf(screen," 4th atom: %d %g %g %g\n",
|
||||||
me,x[i4].x,x[i4].y,x[i4].z);
|
me,x[i4].x,x[i4].y,x[i4].z);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -242,19 +242,19 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
|||||||
ddf1 = df1 = (flt_t)0.0;
|
ddf1 = df1 = (flt_t)0.0;
|
||||||
|
|
||||||
for (int i = 0; i < m; i++) {
|
for (int i = 0; i < m; i++) {
|
||||||
ddf1 = p*c - df1*s;
|
ddf1 = p*c - df1*s;
|
||||||
df1 = p*s + df1*c;
|
df1 = p*s + df1*c;
|
||||||
p = ddf1;
|
p = ddf1;
|
||||||
}
|
}
|
||||||
|
|
||||||
p = p*tcos_shift + df1*tsin_shift;
|
p = p*tcos_shift + df1*tsin_shift;
|
||||||
df1 = df1*tcos_shift - ddf1*tsin_shift;
|
df1 = df1*tcos_shift - ddf1*tsin_shift;
|
||||||
df1 *= -m;
|
df1 *= -m;
|
||||||
p += (flt_t)1.0;
|
p += (flt_t)1.0;
|
||||||
|
|
||||||
if (m == 0) {
|
if (m == 0) {
|
||||||
p = (flt_t)1.0 + tcos_shift;
|
p = (flt_t)1.0 + tcos_shift;
|
||||||
df1 = (flt_t)0.0;
|
df1 = (flt_t)0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
|
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
|
||||||
@ -297,20 +297,20 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
|||||||
const flt_t f3z = -sz2 - f4z;
|
const flt_t f3z = -sz2 - f4z;
|
||||||
|
|
||||||
if (EFLAG || VFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
flt_t deng;
|
flt_t deng;
|
||||||
if (EFLAG) deng = tk * p;
|
if (EFLAG) deng = tk * p;
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
|
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
|
||||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||||
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5);
|
sv0, sv1, sv2, sv3, sv4, sv5);
|
||||||
#else
|
#else
|
||||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
|
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
|
||||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||||
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
|
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
|
||||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -319,35 +319,35 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
|||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
f[i1].x += f1x;
|
f[i1].x += f1x;
|
||||||
f[i1].y += f1y;
|
f[i1].y += f1y;
|
||||||
f[i1].z += f1z;
|
f[i1].z += f1z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i2 < nlocal) {
|
if (NEWTON_BOND || i2 < nlocal) {
|
||||||
f[i2].x += f2x;
|
f[i2].x += f2x;
|
||||||
f[i2].y += f2y;
|
f[i2].y += f2y;
|
||||||
f[i2].z += f2z;
|
f[i2].z += f2z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i3 < nlocal) {
|
if (NEWTON_BOND || i3 < nlocal) {
|
||||||
f[i3].x += f3x;
|
f[i3].x += f3x;
|
||||||
f[i3].y += f3y;
|
f[i3].y += f3y;
|
||||||
f[i3].z += f3z;
|
f[i3].z += f3z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i4 < nlocal) {
|
if (NEWTON_BOND || i4 < nlocal) {
|
||||||
f[i4].x += f4x;
|
f[i4].x += f4x;
|
||||||
f[i4].y += f4y;
|
f[i4].y += f4y;
|
||||||
f[i4].z += f4z;
|
f[i4].z += f4z;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} // for n
|
} // for n
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
if (EFLAG) oedihedral += sedihedral;
|
if (EFLAG) oedihedral += sedihedral;
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // omp parallel
|
} // omp parallel
|
||||||
@ -395,7 +395,7 @@ void DihedralHarmonicIntel::init_style()
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void DihedralHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
|
void DihedralHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers)
|
IntelBuffers<flt_t,acc_t> *buffers)
|
||||||
{
|
{
|
||||||
const int bp1 = atom->ndihedraltypes + 1;
|
const int bp1 = atom->ndihedraltypes + 1;
|
||||||
fc.set_ntypes(bp1,memory);
|
fc.set_ntypes(bp1,memory);
|
||||||
@ -412,11 +412,11 @@ void DihedralHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void DihedralHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
|
void DihedralHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
|
||||||
Memory *memory) {
|
Memory *memory) {
|
||||||
if (nbondtypes != _nbondtypes) {
|
if (nbondtypes != _nbondtypes) {
|
||||||
if (_nbondtypes > 0)
|
if (_nbondtypes > 0)
|
||||||
_memory->destroy(bp);
|
_memory->destroy(bp);
|
||||||
|
|
||||||
if (nbondtypes > 0)
|
if (nbondtypes > 0)
|
||||||
_memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
|
_memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -44,8 +44,8 @@ class DihedralHarmonicIntel : public DihedralHarmonic {
|
|||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t, acc_t> *buffers);
|
IntelBuffers<flt_t, acc_t> *buffers);
|
||||||
@ -57,7 +57,7 @@ class DihedralHarmonicIntel : public DihedralHarmonic {
|
|||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
class ForceConst {
|
class ForceConst {
|
||||||
public:
|
public:
|
||||||
typedef struct { flt_t cos_shift, sin_shift, k;
|
typedef struct { flt_t cos_shift, sin_shift, k;
|
||||||
int multiplicity; } fc_packed1;
|
int multiplicity; } fc_packed1;
|
||||||
|
|
||||||
fc_packed1 *bp;
|
fc_packed1 *bp;
|
||||||
|
|||||||
@ -73,8 +73,8 @@ void DihedralOPLSIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void DihedralOPLSIntel::compute(int eflag, int vflag,
|
void DihedralOPLSIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) {
|
if (eflag || vflag) {
|
||||||
ev_setup(eflag,vflag);
|
ev_setup(eflag,vflag);
|
||||||
@ -83,14 +83,14 @@ void DihedralOPLSIntel::compute(int eflag, int vflag,
|
|||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (vflag && !eflag) {
|
if (vflag && !eflag) {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<0,1,1>(vflag, buffers, fc);
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<0,1,0>(vflag, buffers, fc);
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -101,9 +101,9 @@ void DihedralOPLSIntel::compute(int eflag, int vflag,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void DihedralOPLSIntel::eval(const int vflag,
|
void DihedralOPLSIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
|
|
||||||
{
|
{
|
||||||
const int inum = neighbor->ndihedrallist;
|
const int inum = neighbor->ndihedrallist;
|
||||||
@ -131,7 +131,7 @@ void DihedralOPLSIntel::eval(const int vflag,
|
|||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel default(none) \
|
||||||
shared(f_start,f_stride,fc) \
|
shared(f_start,f_stride,fc) \
|
||||||
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
|
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
@ -146,7 +146,7 @@ void DihedralOPLSIntel::eval(const int vflag,
|
|||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
const int5_t * _noalias const dihedrallist =
|
const int5_t * _noalias const dihedrallist =
|
||||||
(int5_t *) neighbor->dihedrallist[0];
|
(int5_t *) neighbor->dihedrallist[0];
|
||||||
|
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
@ -179,7 +179,7 @@ void DihedralOPLSIntel::eval(const int vflag,
|
|||||||
const flt_t vb2zm = x[i2].z - x[i3].z;
|
const flt_t vb2zm = x[i2].z - x[i3].z;
|
||||||
|
|
||||||
// 3rd bond
|
// 3rd bond
|
||||||
|
|
||||||
const flt_t vb3x = x[i4].x - x[i3].x;
|
const flt_t vb3x = x[i4].x - x[i3].x;
|
||||||
const flt_t vb3y = x[i4].y - x[i3].y;
|
const flt_t vb3y = x[i4].y - x[i3].y;
|
||||||
const flt_t vb3z = x[i4].z - x[i3].z;
|
const flt_t vb3z = x[i4].z - x[i3].z;
|
||||||
@ -209,7 +209,7 @@ void DihedralOPLSIntel::eval(const int vflag,
|
|||||||
const flt_t c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
|
const flt_t c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
|
||||||
|
|
||||||
flt_t ctmp = -vb1x*vb2xm - vb1y*vb2ym - vb1z*vb2zm;
|
flt_t ctmp = -vb1x*vb2xm - vb1y*vb2ym - vb1z*vb2zm;
|
||||||
const flt_t r12c1 = rb1 * rb2;
|
const flt_t r12c1 = rb1 * rb2;
|
||||||
const flt_t c1mag = ctmp * r12c1;
|
const flt_t c1mag = ctmp * r12c1;
|
||||||
|
|
||||||
ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
|
ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
|
||||||
@ -240,25 +240,25 @@ void DihedralOPLSIntel::eval(const int vflag,
|
|||||||
// error check
|
// error check
|
||||||
#ifndef LMP_INTEL_USE_SIMDOFF
|
#ifndef LMP_INTEL_USE_SIMDOFF
|
||||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||||
int me = comm->me;
|
int me = comm->me;
|
||||||
|
|
||||||
if (screen) {
|
if (screen) {
|
||||||
char str[128];
|
char str[128];
|
||||||
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
|
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
|
||||||
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
||||||
TAGINT_FORMAT " " TAGINT_FORMAT,
|
TAGINT_FORMAT " " TAGINT_FORMAT,
|
||||||
me,tid,update->ntimestep,
|
me,tid,update->ntimestep,
|
||||||
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
||||||
error->warning(FLERR,str,0);
|
error->warning(FLERR,str,0);
|
||||||
fprintf(screen," 1st atom: %d %g %g %g\n",
|
fprintf(screen," 1st atom: %d %g %g %g\n",
|
||||||
me,x[i1].x,x[i1].y,x[i1].z);
|
me,x[i1].x,x[i1].y,x[i1].z);
|
||||||
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
||||||
me,x[i2].x,x[i2].y,x[i2].z);
|
me,x[i2].x,x[i2].y,x[i2].z);
|
||||||
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
||||||
me,x[i3].x,x[i3].y,x[i3].z);
|
me,x[i3].x,x[i3].y,x[i3].z);
|
||||||
fprintf(screen," 4th atom: %d %g %g %g\n",
|
fprintf(screen," 4th atom: %d %g %g %g\n",
|
||||||
me,x[i4].x,x[i4].y,x[i4].z);
|
me,x[i4].x,x[i4].y,x[i4].z);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -283,14 +283,14 @@ void DihedralOPLSIntel::eval(const int vflag,
|
|||||||
const flt_t sin_4phim = (flt_t)2.0 * cos_2phi * sin_2phim;
|
const flt_t sin_4phim = (flt_t)2.0 * cos_2phi * sin_2phim;
|
||||||
|
|
||||||
flt_t p, pd;
|
flt_t p, pd;
|
||||||
p = fc.bp[type].k1*((flt_t)1.0 + c) +
|
p = fc.bp[type].k1*((flt_t)1.0 + c) +
|
||||||
fc.bp[type].k2*((flt_t)1.0 - cos_2phi) +
|
fc.bp[type].k2*((flt_t)1.0 - cos_2phi) +
|
||||||
fc.bp[type].k3*((flt_t)1.0 + cos_3phi) +
|
fc.bp[type].k3*((flt_t)1.0 + cos_3phi) +
|
||||||
fc.bp[type].k4*((flt_t)1.0 - cos_4phi) ;
|
fc.bp[type].k4*((flt_t)1.0 - cos_4phi) ;
|
||||||
pd = fc.bp[type].k1 -
|
pd = fc.bp[type].k1 -
|
||||||
(flt_t)2.0 * fc.bp[type].k2 * sin_2phim +
|
(flt_t)2.0 * fc.bp[type].k2 * sin_2phim +
|
||||||
(flt_t)3.0 * fc.bp[type].k3 * sin_3phim -
|
(flt_t)3.0 * fc.bp[type].k3 * sin_3phim -
|
||||||
(flt_t)4.0 * fc.bp[type].k4 * sin_4phim;
|
(flt_t)4.0 * fc.bp[type].k4 * sin_4phim;
|
||||||
|
|
||||||
flt_t edihed;
|
flt_t edihed;
|
||||||
if (EFLAG) edihed = p;
|
if (EFLAG) edihed = p;
|
||||||
@ -327,18 +327,18 @@ void DihedralOPLSIntel::eval(const int vflag,
|
|||||||
|
|
||||||
if (EFLAG || VFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
|
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
|
||||||
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||||
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5);
|
sv0, sv1, sv2, sv3, sv4, sv5);
|
||||||
#else
|
#else
|
||||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
|
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
|
||||||
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||||
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
|
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
|
||||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
@ -346,35 +346,35 @@ void DihedralOPLSIntel::eval(const int vflag,
|
|||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
f[i1].x += f1x;
|
f[i1].x += f1x;
|
||||||
f[i1].y += f1y;
|
f[i1].y += f1y;
|
||||||
f[i1].z += f1z;
|
f[i1].z += f1z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i2 < nlocal) {
|
if (NEWTON_BOND || i2 < nlocal) {
|
||||||
f[i2].x += f2x;
|
f[i2].x += f2x;
|
||||||
f[i2].y += f2y;
|
f[i2].y += f2y;
|
||||||
f[i2].z += f2z;
|
f[i2].z += f2z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i3 < nlocal) {
|
if (NEWTON_BOND || i3 < nlocal) {
|
||||||
f[i3].x += f3x;
|
f[i3].x += f3x;
|
||||||
f[i3].y += f3y;
|
f[i3].y += f3y;
|
||||||
f[i3].z += f3z;
|
f[i3].z += f3z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i4 < nlocal) {
|
if (NEWTON_BOND || i4 < nlocal) {
|
||||||
f[i4].x += f4x;
|
f[i4].x += f4x;
|
||||||
f[i4].y += f4y;
|
f[i4].y += f4y;
|
||||||
f[i4].z += f4z;
|
f[i4].z += f4z;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} // for n
|
} // for n
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
if (EFLAG) oedihedral += sedihedral;
|
if (EFLAG) oedihedral += sedihedral;
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // omp parallel
|
} // omp parallel
|
||||||
@ -422,7 +422,7 @@ void DihedralOPLSIntel::init_style()
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc,
|
void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers)
|
IntelBuffers<flt_t,acc_t> *buffers)
|
||||||
{
|
{
|
||||||
const int bp1 = atom->ndihedraltypes + 1;
|
const int bp1 = atom->ndihedraltypes + 1;
|
||||||
fc.set_ntypes(bp1,memory);
|
fc.set_ntypes(bp1,memory);
|
||||||
@ -439,11 +439,11 @@ void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void DihedralOPLSIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
|
void DihedralOPLSIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
|
||||||
Memory *memory) {
|
Memory *memory) {
|
||||||
if (nbondtypes != _nbondtypes) {
|
if (nbondtypes != _nbondtypes) {
|
||||||
if (_nbondtypes > 0)
|
if (_nbondtypes > 0)
|
||||||
_memory->destroy(bp);
|
_memory->destroy(bp);
|
||||||
|
|
||||||
if (nbondtypes > 0)
|
if (nbondtypes > 0)
|
||||||
_memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
|
_memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -44,8 +44,8 @@ class DihedralOPLSIntel : public DihedralOPLS {
|
|||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t, acc_t> *buffers);
|
IntelBuffers<flt_t, acc_t> *buffers);
|
||||||
|
|||||||
@ -96,7 +96,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
|||||||
_allow_separate_buffers = 1;
|
_allow_separate_buffers = 1;
|
||||||
_offload_ghost = -1;
|
_offload_ghost = -1;
|
||||||
_lrt = 0;
|
_lrt = 0;
|
||||||
|
|
||||||
int iarg = 4;
|
int iarg = 4;
|
||||||
while (iarg < narg) {
|
while (iarg < narg) {
|
||||||
if (strcmp(arg[iarg],"omp") == 0) {
|
if (strcmp(arg[iarg],"omp") == 0) {
|
||||||
@ -141,7 +141,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
|||||||
else error->all(FLERR,"Illegal package intel command");
|
else error->all(FLERR,"Illegal package intel command");
|
||||||
iarg += 2;
|
iarg += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
// undocumented options
|
// undocumented options
|
||||||
|
|
||||||
else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) {
|
else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) {
|
||||||
@ -179,7 +179,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
|||||||
_real_space_comm = MPI_COMM_WORLD;
|
_real_space_comm = MPI_COMM_WORLD;
|
||||||
if (no_affinity == 0)
|
if (no_affinity == 0)
|
||||||
if (set_host_affinity(nomp) != 0)
|
if (set_host_affinity(nomp) != 0)
|
||||||
error->all(FLERR,"Could not set host affinity for offload tasks");
|
error->all(FLERR,"Could not set host affinity for offload tasks");
|
||||||
}
|
}
|
||||||
|
|
||||||
int max_offload_threads = 0, offload_cores = 0;
|
int max_offload_threads = 0, offload_cores = 0;
|
||||||
@ -264,7 +264,7 @@ FixIntel::~FixIntel()
|
|||||||
double *time2 = off_watch_neighbor();
|
double *time2 = off_watch_neighbor();
|
||||||
int *overflow = get_off_overflow_flag();
|
int *overflow = get_off_overflow_flag();
|
||||||
if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL &&
|
if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL &&
|
||||||
overflow != NULL) {
|
overflow != NULL) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
|
nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
@ -320,11 +320,11 @@ void FixIntel::init()
|
|||||||
if (strstr(hybrid->keywords[i], "/intel") != NULL)
|
if (strstr(hybrid->keywords[i], "/intel") != NULL)
|
||||||
nstyles++;
|
nstyles++;
|
||||||
else
|
else
|
||||||
force->pair->no_virial_fdotr_compute = 1;
|
force->pair->no_virial_fdotr_compute = 1;
|
||||||
}
|
}
|
||||||
if (nstyles > 1)
|
if (nstyles > 1)
|
||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"Currently, cannot use more than one intel style with hybrid.");
|
"Currently, cannot use more than one intel style with hybrid.");
|
||||||
|
|
||||||
check_neighbor_intel();
|
check_neighbor_intel();
|
||||||
int off_mode = 0;
|
int off_mode = 0;
|
||||||
@ -349,13 +349,13 @@ void FixIntel::setup(int vflag)
|
|||||||
{
|
{
|
||||||
if (neighbor->style != BIN)
|
if (neighbor->style != BIN)
|
||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"Currently, neighbor style BIN must be used with Intel package.");
|
"Currently, neighbor style BIN must be used with Intel package.");
|
||||||
if (neighbor->exclude_setting() != 0)
|
if (neighbor->exclude_setting() != 0)
|
||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"Currently, cannot use neigh_modify exclude with Intel package.");
|
"Currently, cannot use neigh_modify exclude with Intel package.");
|
||||||
if (vflag_atom)
|
if (vflag_atom)
|
||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"Cannot currently get per-atom virials with Intel package.");
|
"Cannot currently get per-atom virials with Intel package.");
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
post_force(vflag);
|
post_force(vflag);
|
||||||
#endif
|
#endif
|
||||||
@ -392,7 +392,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
|
|||||||
double *time2 = off_watch_neighbor();
|
double *time2 = off_watch_neighbor();
|
||||||
int *overflow = get_off_overflow_flag();
|
int *overflow = get_off_overflow_flag();
|
||||||
if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL &&
|
if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL &&
|
||||||
overflow != NULL) {
|
overflow != NULL) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
|
nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
|
||||||
in(overflow:length(5) alloc_if(1) free_if(0))
|
in(overflow:length(5) alloc_if(1) free_if(0))
|
||||||
@ -407,7 +407,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
|
|||||||
error->warning(FLERR, "Unknown Intel Compiler Version\n");
|
error->warning(FLERR, "Unknown Intel Compiler Version\n");
|
||||||
#else
|
#else
|
||||||
if (__INTEL_COMPILER_BUILD_DATE != 20131008 &&
|
if (__INTEL_COMPILER_BUILD_DATE != 20131008 &&
|
||||||
__INTEL_COMPILER_BUILD_DATE < 20141023)
|
__INTEL_COMPILER_BUILD_DATE < 20141023)
|
||||||
error->warning(FLERR, "Unsupported Intel Compiler.");
|
error->warning(FLERR, "Unsupported Intel Compiler.");
|
||||||
#endif
|
#endif
|
||||||
#if !defined(__INTEL_COMPILER)
|
#if !defined(__INTEL_COMPILER)
|
||||||
@ -438,24 +438,24 @@ void FixIntel::pair_init_check(const bool cdmessage)
|
|||||||
if (comm->me == 0) {
|
if (comm->me == 0) {
|
||||||
if (screen) {
|
if (screen) {
|
||||||
fprintf(screen,
|
fprintf(screen,
|
||||||
"----------------------------------------------------------\n");
|
"----------------------------------------------------------\n");
|
||||||
if (_offload_balance != 0.0) {
|
if (_offload_balance != 0.0) {
|
||||||
fprintf(screen,"Using Intel Coprocessor with %d threads per core, ",
|
fprintf(screen,"Using Intel Coprocessor with %d threads per core, ",
|
||||||
_offload_tpc);
|
_offload_tpc);
|
||||||
fprintf(screen,"%d threads per task\n",_offload_threads);
|
fprintf(screen,"%d threads per task\n",_offload_threads);
|
||||||
} else {
|
} else {
|
||||||
fprintf(screen,"Using Intel Package without Coprocessor.\n");
|
fprintf(screen,"Using Intel Package without Coprocessor.\n");
|
||||||
}
|
}
|
||||||
fprintf(screen,"Precision: %s\n",kmode);
|
fprintf(screen,"Precision: %s\n",kmode);
|
||||||
if (cdmessage) {
|
if (cdmessage) {
|
||||||
#ifdef LMP_USE_AVXCD
|
#ifdef LMP_USE_AVXCD
|
||||||
fprintf(screen,"AVX512 CD Optimizations: Enabled\n");
|
fprintf(screen,"AVX512 CD Optimizations: Enabled\n");
|
||||||
#else
|
#else
|
||||||
fprintf(screen,"AVX512 CD Optimizations: Disabled\n");
|
fprintf(screen,"AVX512 CD Optimizations: Disabled\n");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
fprintf(screen,
|
fprintf(screen,
|
||||||
"----------------------------------------------------------\n");
|
"----------------------------------------------------------\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -464,7 +464,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
|
|||||||
|
|
||||||
void FixIntel::bond_init_check()
|
void FixIntel::bond_init_check()
|
||||||
{
|
{
|
||||||
if (_offload_balance != 0.0 && atom->molecular &&
|
if (_offload_balance != 0.0 && atom->molecular &&
|
||||||
force->newton_pair != force->newton_bond)
|
force->newton_pair != force->newton_bond)
|
||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"USER-INTEL package requires same setting for newton bond and non-bond.");
|
"USER-INTEL package requires same setting for newton bond and non-bond.");
|
||||||
@ -573,7 +573,7 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
|
|||||||
int o_range, f_stride;
|
int o_range, f_stride;
|
||||||
if (force->newton_pair)
|
if (force->newton_pair)
|
||||||
o_range = atom->nlocal + atom->nghost;
|
o_range = atom->nlocal + atom->nghost;
|
||||||
else
|
else
|
||||||
o_range = atom->nlocal;
|
o_range = atom->nlocal;
|
||||||
IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque);
|
IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque);
|
||||||
|
|
||||||
@ -588,18 +588,18 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
|
|||||||
_use_simd_pragma("vector aligned")
|
_use_simd_pragma("vector aligned")
|
||||||
_use_simd_pragma("simd")
|
_use_simd_pragma("simd")
|
||||||
for (int n = 0; n < o_range; n++)
|
for (int n = 0; n < o_range; n++)
|
||||||
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
|
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
|
||||||
} else if (_nthreads == 2) {
|
} else if (_nthreads == 2) {
|
||||||
_use_simd_pragma("vector aligned")
|
_use_simd_pragma("vector aligned")
|
||||||
_use_simd_pragma("simd")
|
_use_simd_pragma("simd")
|
||||||
for (int n = 0; n < o_range; n++)
|
for (int n = 0; n < o_range; n++)
|
||||||
f_scalar[n] += f_scalar2[n];
|
f_scalar[n] += f_scalar2[n];
|
||||||
} else {
|
} else {
|
||||||
acc_t *f_scalar3 = f_scalar2 + f_stride4;
|
acc_t *f_scalar3 = f_scalar2 + f_stride4;
|
||||||
_use_simd_pragma("vector aligned")
|
_use_simd_pragma("vector aligned")
|
||||||
_use_simd_pragma("simd")
|
_use_simd_pragma("simd")
|
||||||
for (int n = 0; n < o_range; n++)
|
for (int n = 0; n < o_range; n++)
|
||||||
f_scalar[n] += f_scalar2[n] + f_scalar3[n];
|
f_scalar[n] += f_scalar2[n] + f_scalar3[n];
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
@ -608,13 +608,13 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
|
|||||||
{
|
{
|
||||||
int iifrom, iito, tid;
|
int iifrom, iito, tid;
|
||||||
IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
|
IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
|
||||||
sizeof(acc_t));
|
sizeof(acc_t));
|
||||||
|
|
||||||
acc_t *f_scalar2 = f_scalar + f_stride4;
|
acc_t *f_scalar2 = f_scalar + f_stride4;
|
||||||
for (int t = 1; t < _nthreads; t++) {
|
for (int t = 1; t < _nthreads; t++) {
|
||||||
_use_simd_pragma("vector aligned")
|
_use_simd_pragma("vector aligned")
|
||||||
_use_simd_pragma("simd")
|
_use_simd_pragma("simd")
|
||||||
for (int n = iifrom; n < iito; n++)
|
for (int n = iifrom; n < iito; n++)
|
||||||
f_scalar[n] += f_scalar2[n];
|
f_scalar[n] += f_scalar2[n];
|
||||||
f_scalar2 += f_stride4;
|
f_scalar2 += f_stride4;
|
||||||
}
|
}
|
||||||
@ -648,33 +648,33 @@ template <class ft, class acc_t>
|
|||||||
void FixIntel::add_results(const ft * _noalias const f_in,
|
void FixIntel::add_results(const ft * _noalias const f_in,
|
||||||
const acc_t * _noalias const ev_global,
|
const acc_t * _noalias const ev_global,
|
||||||
const int eatom, const int vatom,
|
const int eatom, const int vatom,
|
||||||
const int offload) {
|
const int offload) {
|
||||||
start_watch(TIME_PACK);
|
start_watch(TIME_PACK);
|
||||||
int f_length;
|
int f_length;
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (_separate_buffers) {
|
if (_separate_buffers) {
|
||||||
if (offload) {
|
if (offload) {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
|
add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
|
||||||
const acc_t * _noalias const enull = 0;
|
const acc_t * _noalias const enull = 0;
|
||||||
int offset = _offload_nlocal;
|
int offset = _offload_nlocal;
|
||||||
if (atom->torque) offset *= 2;
|
if (atom->torque) offset *= 2;
|
||||||
add_oresults(f_in + offset, enull, eatom, vatom,
|
add_oresults(f_in + offset, enull, eatom, vatom,
|
||||||
_offload_min_ghost, _offload_nghost);
|
_offload_min_ghost, _offload_nghost);
|
||||||
} else
|
} else
|
||||||
add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair());
|
add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair());
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
add_oresults(f_in, ev_global, eatom, vatom,
|
add_oresults(f_in, ev_global, eatom, vatom,
|
||||||
_host_min_local, _host_used_local);
|
_host_min_local, _host_used_local);
|
||||||
const acc_t * _noalias const enull = 0;
|
const acc_t * _noalias const enull = 0;
|
||||||
int offset = _host_used_local;
|
int offset = _host_used_local;
|
||||||
if (atom->torque) offset *= 2;
|
if (atom->torque) offset *= 2;
|
||||||
add_oresults(f_in + offset, enull, eatom,
|
add_oresults(f_in + offset, enull, eatom,
|
||||||
vatom, _host_min_ghost, _host_used_ghost);
|
vatom, _host_min_ghost, _host_used_ghost);
|
||||||
} else {
|
} else {
|
||||||
int start = host_start_pair();
|
int start = host_start_pair();
|
||||||
add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start);
|
add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stop_watch(TIME_PACK);
|
stop_watch(TIME_PACK);
|
||||||
@ -685,9 +685,9 @@ void FixIntel::add_results(const ft * _noalias const f_in,
|
|||||||
start = 0;
|
start = 0;
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
if (_offload_noghost == 0)
|
if (_offload_noghost == 0)
|
||||||
f_length = atom->nlocal + atom->nghost;
|
f_length = atom->nlocal + atom->nghost;
|
||||||
else
|
else
|
||||||
f_length = atom->nlocal;
|
f_length = atom->nlocal;
|
||||||
} else
|
} else
|
||||||
f_length = offload_end_pair();
|
f_length = offload_end_pair();
|
||||||
} else {
|
} else {
|
||||||
@ -714,9 +714,9 @@ void FixIntel::add_results(const ft * _noalias const f_in,
|
|||||||
|
|
||||||
template <class ft, class acc_t>
|
template <class ft, class acc_t>
|
||||||
void FixIntel::add_oresults(const ft * _noalias const f_in,
|
void FixIntel::add_oresults(const ft * _noalias const f_in,
|
||||||
const acc_t * _noalias const ev_global,
|
const acc_t * _noalias const ev_global,
|
||||||
const int eatom, const int vatom,
|
const int eatom, const int vatom,
|
||||||
const int out_offset, const int nall) {
|
const int out_offset, const int nall) {
|
||||||
lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
|
lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
|
||||||
if (atom->torque) {
|
if (atom->torque) {
|
||||||
if (f_in[1].w)
|
if (f_in[1].w)
|
||||||
@ -744,12 +744,12 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
|
|||||||
if (atom->torque) {
|
if (atom->torque) {
|
||||||
int ii = ifrom * 2;
|
int ii = ifrom * 2;
|
||||||
lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
|
lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
|
||||||
out_offset;
|
out_offset;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
|
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma novector
|
#pragma novector
|
||||||
#endif
|
#endif
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
f[i].x += f_in[ii].x;
|
f[i].x += f_in[ii].x;
|
||||||
f[i].y += f_in[ii].y;
|
f[i].y += f_in[ii].y;
|
||||||
@ -762,8 +762,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma novector
|
#pragma novector
|
||||||
#endif
|
#endif
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
f[i].x += f_in[ii].x;
|
f[i].x += f_in[ii].x;
|
||||||
f[i].y += f_in[ii].y;
|
f[i].y += f_in[ii].y;
|
||||||
@ -776,10 +776,10 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
|
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma novector
|
#pragma novector
|
||||||
#endif
|
#endif
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
f[i].x += f_in[i].x;
|
f[i].x += f_in[i].x;
|
||||||
f[i].y += f_in[i].y;
|
f[i].y += f_in[i].y;
|
||||||
@ -788,8 +788,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma novector
|
#pragma novector
|
||||||
#endif
|
#endif
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
f[i].x += f_in[i].x;
|
f[i].x += f_in[i].x;
|
||||||
f[i].y += f_in[i].y;
|
f[i].y += f_in[i].y;
|
||||||
@ -931,7 +931,7 @@ void FixIntel::output_timing_data() {
|
|||||||
balance_out[0] = _balance_pair;
|
balance_out[0] = _balance_pair;
|
||||||
balance_out[1] = _balance_neighbor;
|
balance_out[1] = _balance_neighbor;
|
||||||
MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM,
|
MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM,
|
||||||
0, _real_space_comm);
|
0, _real_space_comm);
|
||||||
balance_in[0] /= size;
|
balance_in[0] /= size;
|
||||||
balance_in[1] /= size;
|
balance_in[1] /= size;
|
||||||
|
|
||||||
@ -958,25 +958,25 @@ void FixIntel::output_timing_data() {
|
|||||||
balance_in[1]);
|
balance_in[1]);
|
||||||
fprintf(_tscreen, " Offload Pair Balance %f\n",
|
fprintf(_tscreen, " Offload Pair Balance %f\n",
|
||||||
balance_in[0]);
|
balance_in[0]);
|
||||||
fprintf(_tscreen, " Offload Ghost Atoms ");
|
fprintf(_tscreen, " Offload Ghost Atoms ");
|
||||||
if (_offload_noghost) fprintf(_tscreen,"No\n");
|
if (_offload_noghost) fprintf(_tscreen,"No\n");
|
||||||
else fprintf(_tscreen,"Yes\n");
|
else fprintf(_tscreen,"Yes\n");
|
||||||
#ifdef TIME_BALANCE
|
#ifdef TIME_BALANCE
|
||||||
fprintf(_tscreen, " Offload Imbalance Seconds %f\n",
|
fprintf(_tscreen, " Offload Imbalance Seconds %f\n",
|
||||||
timers[TIME_IMBALANCE]);
|
timers[TIME_IMBALANCE]);
|
||||||
fprintf(_tscreen, " Offload Min/Max Seconds ");
|
fprintf(_tscreen, " Offload Min/Max Seconds ");
|
||||||
for (int i = 0; i < NUM_ITIMERS; i++)
|
for (int i = 0; i < NUM_ITIMERS; i++)
|
||||||
fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
|
fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
|
||||||
fprintf(_tscreen, "\n");
|
fprintf(_tscreen, "\n");
|
||||||
#endif
|
#endif
|
||||||
double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] +
|
double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] +
|
||||||
timers[TIME_OFFLOAD_WAIT];
|
timers[TIME_OFFLOAD_WAIT];
|
||||||
double ct = timers[TIME_OFFLOAD_NEIGHBOR] +
|
double ct = timers[TIME_OFFLOAD_NEIGHBOR] +
|
||||||
timers[TIME_OFFLOAD_PAIR];
|
timers[TIME_OFFLOAD_PAIR];
|
||||||
double tt = MAX(ht,ct);
|
double tt = MAX(ht,ct);
|
||||||
if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0)
|
if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0)
|
||||||
error->warning(FLERR,
|
error->warning(FLERR,
|
||||||
"Leaving a core free can improve performance for offload");
|
"Leaving a core free can improve performance for offload");
|
||||||
}
|
}
|
||||||
fprintf(_tscreen, "------------------------------------------------\n");
|
fprintf(_tscreen, "------------------------------------------------\n");
|
||||||
}
|
}
|
||||||
@ -999,14 +999,14 @@ int FixIntel::get_ppn(int &node_rank) {
|
|||||||
node_name[name_length] = '\0';
|
node_name[name_length] = '\0';
|
||||||
char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs];
|
char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs];
|
||||||
MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names,
|
MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names,
|
||||||
MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm);
|
MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm);
|
||||||
int ppn = 0;
|
int ppn = 0;
|
||||||
node_rank = 0;
|
node_rank = 0;
|
||||||
for (int i = 0; i < nprocs; i++) {
|
for (int i = 0; i < nprocs; i++) {
|
||||||
if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) {
|
if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) {
|
||||||
ppn++;
|
ppn++;
|
||||||
if (i < rank)
|
if (i < rank)
|
||||||
node_rank++;
|
node_rank++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1068,19 +1068,19 @@ void FixIntel::set_offload_affinity()
|
|||||||
kmp_create_affinity_mask(&mask);
|
kmp_create_affinity_mask(&mask);
|
||||||
int proc = offload_threads * node_rank + tnum;
|
int proc = offload_threads * node_rank + tnum;
|
||||||
#ifdef __AVX512F__
|
#ifdef __AVX512F__
|
||||||
proc = (proc / offload_tpc) + (proc % offload_tpc) *
|
proc = (proc / offload_tpc) + (proc % offload_tpc) *
|
||||||
((offload_cores) / 4);
|
((offload_cores) / 4);
|
||||||
proc += 68;
|
proc += 68;
|
||||||
#else
|
#else
|
||||||
if (offload_affinity_balanced)
|
if (offload_affinity_balanced)
|
||||||
proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1;
|
proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1;
|
||||||
else
|
else
|
||||||
proc += (proc / 4) * (4 - offload_tpc) + 1;
|
proc += (proc / 4) * (4 - offload_tpc) + 1;
|
||||||
#endif
|
#endif
|
||||||
kmp_set_affinity_mask_proc(proc, &mask);
|
kmp_set_affinity_mask_proc(proc, &mask);
|
||||||
if (kmp_set_affinity(&mask) != 0)
|
if (kmp_set_affinity(&mask) != 0)
|
||||||
printf("Could not set affinity on rank %d thread %d to %d\n",
|
printf("Could not set affinity on rank %d thread %d to %d\n",
|
||||||
node_rank, tnum, proc);
|
node_rank, tnum, proc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1110,7 +1110,7 @@ int FixIntel::set_host_affinity(const int nomp)
|
|||||||
char cmd[512];
|
char cmd[512];
|
||||||
char readbuf[INTEL_MAX_HOST_CORE_COUNT*5];
|
char readbuf[INTEL_MAX_HOST_CORE_COUNT*5];
|
||||||
sprintf(cmd, "lscpu -p | grep -v '#' |"
|
sprintf(cmd, "lscpu -p | grep -v '#' |"
|
||||||
"sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'");
|
"sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'");
|
||||||
p = popen(cmd, "r");
|
p = popen(cmd, "r");
|
||||||
if (p == NULL) return -1;
|
if (p == NULL) return -1;
|
||||||
ncores = 0;
|
ncores = 0;
|
||||||
@ -1147,7 +1147,7 @@ int FixIntel::set_host_affinity(const int nomp)
|
|||||||
if (subscription > ncores) {
|
if (subscription > ncores) {
|
||||||
if (rank == 0)
|
if (rank == 0)
|
||||||
error->warning(FLERR,
|
error->warning(FLERR,
|
||||||
"More MPI tasks/OpenMP threads than available cores");
|
"More MPI tasks/OpenMP threads than available cores");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (subscription == ncores)
|
if (subscription == ncores)
|
||||||
@ -1173,10 +1173,10 @@ int FixIntel::set_host_affinity(const int nomp)
|
|||||||
int first = coi_cores + node_rank * mpi_cores;
|
int first = coi_cores + node_rank * mpi_cores;
|
||||||
CPU_ZERO(&cpuset);
|
CPU_ZERO(&cpuset);
|
||||||
for (int i = first; i < first + mpi_cores; i++)
|
for (int i = first; i < first + mpi_cores; i++)
|
||||||
CPU_SET(proc_list[i], &cpuset);
|
CPU_SET(proc_list[i], &cpuset);
|
||||||
if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
|
if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
|
||||||
fail = 1;
|
fail = 1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
plwp++;
|
plwp++;
|
||||||
}
|
}
|
||||||
@ -1189,13 +1189,13 @@ int FixIntel::set_host_affinity(const int nomp)
|
|||||||
buf1 = (float*) malloc(sizeof(float)*pragma_size);
|
buf1 = (float*) malloc(sizeof(float)*pragma_size);
|
||||||
|
|
||||||
#pragma offload target (mic:0) mandatory \
|
#pragma offload target (mic:0) mandatory \
|
||||||
in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \
|
in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \
|
||||||
signal(&sig1)
|
signal(&sig1)
|
||||||
{ buf1[0] = 0.0; }
|
{ buf1[0] = 0.0; }
|
||||||
#pragma offload_wait target(mic:0) wait(&sig1)
|
#pragma offload_wait target(mic:0) wait(&sig1)
|
||||||
|
|
||||||
#pragma offload target (mic:0) mandatory \
|
#pragma offload target (mic:0) mandatory \
|
||||||
out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \
|
out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \
|
||||||
signal(&sig2)
|
signal(&sig2)
|
||||||
{ buf1[0] = 1.0; }
|
{ buf1[0] = 1.0; }
|
||||||
#pragma offload_wait target(mic:0) wait(&sig2)
|
#pragma offload_wait target(mic:0) wait(&sig2)
|
||||||
@ -1211,11 +1211,11 @@ int FixIntel::set_host_affinity(const int nomp)
|
|||||||
|
|
||||||
CPU_ZERO(&cpuset);
|
CPU_ZERO(&cpuset);
|
||||||
for(int i=0; i<coi_cores; i++)
|
for(int i=0; i<coi_cores; i++)
|
||||||
CPU_SET(proc_list[i], &cpuset);
|
CPU_SET(proc_list[i], &cpuset);
|
||||||
|
|
||||||
if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
|
if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
|
||||||
fail = 1;
|
fail = 1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pclose(p);
|
pclose(p);
|
||||||
@ -1228,7 +1228,7 @@ int FixIntel::set_host_affinity(const int nomp)
|
|||||||
if (screen && rank == 0) {
|
if (screen && rank == 0) {
|
||||||
if (coi_cores)
|
if (coi_cores)
|
||||||
fprintf(screen,"Intel Package: Affinitizing %d Offload Threads to %d Cores\n",
|
fprintf(screen,"Intel Package: Affinitizing %d Offload Threads to %d Cores\n",
|
||||||
mlwp, coi_cores);
|
mlwp, coi_cores);
|
||||||
fprintf(screen,"Intel Package: Affinitizing MPI Tasks to %d Cores Each\n",mpi_cores);
|
fprintf(screen,"Intel Package: Affinitizing MPI Tasks to %d Cores Each\n",mpi_cores);
|
||||||
}
|
}
|
||||||
if (fail) return -1;
|
if (fail) return -1;
|
||||||
|
|||||||
@ -72,7 +72,7 @@ class FixIntel : public Fix {
|
|||||||
inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
|
inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
|
||||||
inline int three_body_neighbor() { return _three_body_neighbor; }
|
inline int three_body_neighbor() { return _three_body_neighbor; }
|
||||||
inline void three_body_neighbor(const int i) { _three_body_neighbor = 1; }
|
inline void three_body_neighbor(const int i) { _three_body_neighbor = 1; }
|
||||||
|
|
||||||
inline int need_zero(const int tid) {
|
inline int need_zero(const int tid) {
|
||||||
if (_need_reduce == 0 && tid > 0) return 1;
|
if (_need_reduce == 0 && tid > 0) return 1;
|
||||||
return 0;
|
return 0;
|
||||||
@ -84,11 +84,11 @@ class FixIntel : public Fix {
|
|||||||
}
|
}
|
||||||
inline int pppm_table() {
|
inline int pppm_table() {
|
||||||
if (force->kspace_match("pppm/intel", 0) ||
|
if (force->kspace_match("pppm/intel", 0) ||
|
||||||
force->kspace_match("pppm/disp/intel",0))
|
force->kspace_match("pppm/disp/intel",0))
|
||||||
return INTEL_P3M_TABLE;
|
return INTEL_P3M_TABLE;
|
||||||
else return 0;
|
else return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
IntelBuffers<float,float> *_single_buffers;
|
IntelBuffers<float,float> *_single_buffers;
|
||||||
@ -103,17 +103,17 @@ class FixIntel : public Fix {
|
|||||||
inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
|
inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
|
||||||
double *ev_in, const int offload,
|
double *ev_in, const int offload,
|
||||||
const int eatom = 0, const int vatom = 0,
|
const int eatom = 0, const int vatom = 0,
|
||||||
const int rflag = 0);
|
const int rflag = 0);
|
||||||
inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
|
inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
|
||||||
double *ev_in, const int offload,
|
double *ev_in, const int offload,
|
||||||
const int eatom = 0, const int vatom = 0,
|
const int eatom = 0, const int vatom = 0,
|
||||||
const int rflag = 0);
|
const int rflag = 0);
|
||||||
inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
|
inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
|
||||||
float *ev_in, const int offload,
|
float *ev_in, const int offload,
|
||||||
const int eatom = 0, const int vatom = 0,
|
const int eatom = 0, const int vatom = 0,
|
||||||
const int rflag = 0);
|
const int rflag = 0);
|
||||||
inline void get_buffern(const int offload, int &nlocal, int &nall,
|
inline void get_buffern(const int offload, int &nlocal, int &nall,
|
||||||
int &minlocal);
|
int &minlocal);
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
void post_force(int vflag);
|
void post_force(int vflag);
|
||||||
@ -213,13 +213,13 @@ class FixIntel : public Fix {
|
|||||||
inline void add_results(const ft * _noalias const f_in,
|
inline void add_results(const ft * _noalias const f_in,
|
||||||
const acc_t * _noalias const ev_global,
|
const acc_t * _noalias const ev_global,
|
||||||
const int eatom, const int vatom,
|
const int eatom, const int vatom,
|
||||||
const int offload);
|
const int offload);
|
||||||
|
|
||||||
template <class ft, class acc_t>
|
template <class ft, class acc_t>
|
||||||
inline void add_oresults(const ft * _noalias const f_in,
|
inline void add_oresults(const ft * _noalias const f_in,
|
||||||
const acc_t * _noalias const ev_global,
|
const acc_t * _noalias const ev_global,
|
||||||
const int eatom, const int vatom,
|
const int eatom, const int vatom,
|
||||||
const int out_offset, const int nall);
|
const int out_offset, const int nall);
|
||||||
|
|
||||||
int _offload_affinity_balanced, _offload_threads, _offload_tpc;
|
int _offload_affinity_balanced, _offload_threads, _offload_tpc;
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
@ -235,16 +235,16 @@ class FixIntel : public Fix {
|
|||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
|
void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
|
||||||
int &minlocal) {
|
int &minlocal) {
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (_separate_buffers) {
|
if (_separate_buffers) {
|
||||||
if (offload) {
|
if (offload) {
|
||||||
if (neighbor->ago != 0) {
|
if (neighbor->ago != 0) {
|
||||||
nlocal = _offload_nlocal;
|
nlocal = _offload_nlocal;
|
||||||
nall = _offload_nall;
|
nall = _offload_nall;
|
||||||
} else {
|
} else {
|
||||||
nlocal = atom->nlocal;
|
nlocal = atom->nlocal;
|
||||||
nall = nlocal + atom->nghost;
|
nall = nlocal + atom->nghost;
|
||||||
}
|
}
|
||||||
minlocal = 0;
|
minlocal = 0;
|
||||||
} else {
|
} else {
|
||||||
@ -253,7 +253,7 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
|
|||||||
if (force->newton)
|
if (force->newton)
|
||||||
minlocal = _host_min_local;
|
minlocal = _host_min_local;
|
||||||
else
|
else
|
||||||
minlocal = host_start_pair();
|
minlocal = host_start_pair();
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -271,7 +271,7 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
|
|||||||
void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
|
void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
|
||||||
double *ev_in, const int offload,
|
double *ev_in, const int offload,
|
||||||
const int eatom, const int vatom,
|
const int eatom, const int vatom,
|
||||||
const int rflag) {
|
const int rflag) {
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (offload) {
|
if (offload) {
|
||||||
_off_results_eatom = eatom;
|
_off_results_eatom = eatom;
|
||||||
@ -299,7 +299,7 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
|
|||||||
void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
|
void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
|
||||||
double *ev_in, const int offload,
|
double *ev_in, const int offload,
|
||||||
const int eatom, const int vatom,
|
const int eatom, const int vatom,
|
||||||
const int rflag) {
|
const int rflag) {
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (offload) {
|
if (offload) {
|
||||||
_off_results_eatom = eatom;
|
_off_results_eatom = eatom;
|
||||||
@ -361,12 +361,12 @@ int FixIntel::offload_end_neighbor() {
|
|||||||
if (atom->nlocal < 2)
|
if (atom->nlocal < 2)
|
||||||
error->one(FLERR,"Too few atoms for load balancing offload");
|
error->one(FLERR,"Too few atoms for load balancing offload");
|
||||||
double granularity = 1.0 / atom->nlocal;
|
double granularity = 1.0 / atom->nlocal;
|
||||||
if (_balance_neighbor < granularity)
|
if (_balance_neighbor < granularity)
|
||||||
_balance_neighbor = granularity + 1e-10;
|
_balance_neighbor = granularity + 1e-10;
|
||||||
else if (_balance_neighbor > 1.0 - granularity)
|
else if (_balance_neighbor > 1.0 - granularity)
|
||||||
_balance_neighbor = 1.0 - granularity + 1e-10;
|
_balance_neighbor = 1.0 - granularity + 1e-10;
|
||||||
}
|
}
|
||||||
return _balance_neighbor * atom->nlocal;
|
return _balance_neighbor * atom->nlocal;
|
||||||
}
|
}
|
||||||
|
|
||||||
int FixIntel::offload_end_pair() {
|
int FixIntel::offload_end_pair() {
|
||||||
@ -517,7 +517,7 @@ The newton setting must be the same for both pairwise and bonded forces.
|
|||||||
|
|
||||||
E: Intel styles for bond/angle/dihedral/improper require intel pair style."
|
E: Intel styles for bond/angle/dihedral/improper require intel pair style."
|
||||||
|
|
||||||
You cannot use the USER-INTEL package for bond calculations without a
|
You cannot use the USER-INTEL package for bond calculations without a
|
||||||
USER-INTEL supported pair style.
|
USER-INTEL supported pair style.
|
||||||
|
|
||||||
E: Intel styles for kspace require intel pair style.
|
E: Intel styles for kspace require intel pair style.
|
||||||
|
|||||||
@ -45,7 +45,7 @@ typedef struct { double x,y,z; } dbl3_t;
|
|||||||
NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion
|
NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion
|
||||||
---------------------------------------------------------------------- */
|
---------------------------------------------------------------------- */
|
||||||
|
|
||||||
FixNHIntel::FixNHIntel(LAMMPS *lmp, int narg, char **arg) :
|
FixNHIntel::FixNHIntel(LAMMPS *lmp, int narg, char **arg) :
|
||||||
FixNH(lmp, narg, arg)
|
FixNH(lmp, narg, arg)
|
||||||
{
|
{
|
||||||
_dtfm = 0;
|
_dtfm = 0;
|
||||||
@ -118,12 +118,12 @@ void FixNHIntel::remap()
|
|||||||
#endif
|
#endif
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
if (mask[i] & dilate_group_bit) {
|
if (mask[i] & dilate_group_bit) {
|
||||||
const double d0 = x[i].x - b0;
|
const double d0 = x[i].x - b0;
|
||||||
const double d1 = x[i].y - b1;
|
const double d1 = x[i].y - b1;
|
||||||
const double d2 = x[i].z - b2;
|
const double d2 = x[i].z - b2;
|
||||||
x[i].x = hi0*d0 + hi5*d1 + hi4*d2;
|
x[i].x = hi0*d0 + hi5*d1 + hi4*d2;
|
||||||
x[i].y = hi1*d1 + hi3*d2;
|
x[i].y = hi1*d1 + hi3*d2;
|
||||||
x[i].z = hi2*d2;
|
x[i].z = hi2*d2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -294,9 +294,9 @@ void FixNHIntel::remap()
|
|||||||
#endif
|
#endif
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
if (mask[i] & dilate_group_bit) {
|
if (mask[i] & dilate_group_bit) {
|
||||||
x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
|
x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
|
||||||
x[i].y = h1*x[i].y + h3*x[i].z + nb1;
|
x[i].y = h1*x[i].y + h3*x[i].z + nb1;
|
||||||
x[i].z = h2*x[i].z + nb2;
|
x[i].z = h2*x[i].z + nb2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -318,7 +318,7 @@ void FixNHIntel::reset_dt()
|
|||||||
dto = dthalf;
|
dto = dthalf;
|
||||||
|
|
||||||
// If using respa, then remap is performed in innermost level
|
// If using respa, then remap is performed in innermost level
|
||||||
|
|
||||||
if (strstr(update->integrate_style,"respa"))
|
if (strstr(update->integrate_style,"respa"))
|
||||||
dto = 0.5*step_respa[0];
|
dto = 0.5*step_respa[0];
|
||||||
|
|
||||||
@ -329,7 +329,7 @@ void FixNHIntel::reset_dt()
|
|||||||
tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain);
|
tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain);
|
||||||
|
|
||||||
const int * const mask = atom->mask;
|
const int * const mask = atom->mask;
|
||||||
const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
|
const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
|
||||||
atom->nlocal;
|
atom->nlocal;
|
||||||
|
|
||||||
if (nlocal > _nlocal_max) {
|
if (nlocal > _nlocal_max) {
|
||||||
@ -345,9 +345,9 @@ void FixNHIntel::reset_dt()
|
|||||||
const double * const rmass = atom->rmass;
|
const double * const rmass = atom->rmass;
|
||||||
int n = 0;
|
int n = 0;
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const double * const mass = atom->mass;
|
const double * const mass = atom->mass;
|
||||||
@ -364,29 +364,29 @@ void FixNHIntel::reset_dt()
|
|||||||
const double * const rmass = atom->rmass;
|
const double * const rmass = atom->rmass;
|
||||||
int n = 0;
|
int n = 0;
|
||||||
for (int i = 0; i < nlocal; i++)
|
for (int i = 0; i < nlocal; i++)
|
||||||
if (mask[i] & groupbit) {
|
if (mask[i] & groupbit) {
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
} else {
|
} else {
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const double * const mass = atom->mass;
|
const double * const mass = atom->mass;
|
||||||
const int * const type = atom->type;
|
const int * const type = atom->type;
|
||||||
int n = 0;
|
int n = 0;
|
||||||
for (int i = 0; i < nlocal; i++)
|
for (int i = 0; i < nlocal; i++)
|
||||||
if (mask[i] & groupbit) {
|
if (mask[i] & groupbit) {
|
||||||
_dtfm[n++] = dtf / mass[type[i]];
|
_dtfm[n++] = dtf / mass[type[i]];
|
||||||
_dtfm[n++] = dtf / mass[type[i]];
|
_dtfm[n++] = dtf / mass[type[i]];
|
||||||
_dtfm[n++] = dtf / mass[type[i]];
|
_dtfm[n++] = dtf / mass[type[i]];
|
||||||
} else {
|
} else {
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -431,9 +431,9 @@ void FixNHIntel::nh_v_press()
|
|||||||
#endif
|
#endif
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
if (mask[i] & groupbit) {
|
if (mask[i] & groupbit) {
|
||||||
v[i].x *= f0;
|
v[i].x *= f0;
|
||||||
v[i].y *= f1;
|
v[i].y *= f1;
|
||||||
v[i].z *= f2;
|
v[i].z *= f2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -506,7 +506,7 @@ void FixNHIntel::nh_v_temp()
|
|||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for (int i = 0; i < _nlocal3; i++)
|
for (int i = 0; i < _nlocal3; i++)
|
||||||
v[i] *= factor_eta;
|
v[i] *= factor_eta;
|
||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
@ -514,12 +514,12 @@ void FixNHIntel::nh_v_temp()
|
|||||||
#endif
|
#endif
|
||||||
for (int i = 0; i < _nlocal3; i++) {
|
for (int i = 0; i < _nlocal3; i++) {
|
||||||
if (_dtfm[i] != 0.0)
|
if (_dtfm[i] != 0.0)
|
||||||
v[i] *= factor_eta;
|
v[i] *= factor_eta;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
double FixNHIntel::memory_usage()
|
double FixNHIntel::memory_usage()
|
||||||
{
|
{
|
||||||
return FixNH::memory_usage() + _nlocal_max * 3 * sizeof(double);
|
return FixNH::memory_usage() + _nlocal_max * 3 * sizeof(double);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -35,7 +35,7 @@ class FixNHIntel : public FixNH {
|
|||||||
int _nlocal3, _nlocal_max;
|
int _nlocal3, _nlocal_max;
|
||||||
|
|
||||||
virtual void remap();
|
virtual void remap();
|
||||||
virtual void nve_x();
|
virtual void nve_x();
|
||||||
virtual void nve_v();
|
virtual void nve_v();
|
||||||
virtual void nh_v_press();
|
virtual void nh_v_press();
|
||||||
virtual void nh_v_temp();
|
virtual void nh_v_temp();
|
||||||
|
|||||||
@ -36,7 +36,7 @@ using namespace FixConst;
|
|||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
FixNVEAsphereIntel::FixNVEAsphereIntel(LAMMPS *lmp, int narg, char **arg) :
|
FixNVEAsphereIntel::FixNVEAsphereIntel(LAMMPS *lmp, int narg, char **arg) :
|
||||||
FixNVE(lmp, narg, arg)
|
FixNVE(lmp, narg, arg)
|
||||||
{
|
{
|
||||||
_dtfm = 0;
|
_dtfm = 0;
|
||||||
_nlocal3 = 0;
|
_nlocal3 = 0;
|
||||||
@ -129,9 +129,9 @@ void FixNVEAsphereIntel::initial_integrate(int vflag)
|
|||||||
#endif
|
#endif
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
if (mask[i] & groupbit) {
|
if (mask[i] & groupbit) {
|
||||||
double *quat = bonus[ellipsoid[i]].quat;
|
double *quat = bonus[ellipsoid[i]].quat;
|
||||||
ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i],
|
ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i],
|
||||||
_inertia1[i], _inertia2[i]);
|
_inertia1[i], _inertia2[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -168,7 +168,7 @@ void FixNVEAsphereIntel::reset_dt() {
|
|||||||
dtf = 0.5 * update->dt * force->ftm2v;
|
dtf = 0.5 * update->dt * force->ftm2v;
|
||||||
|
|
||||||
const int * const mask = atom->mask;
|
const int * const mask = atom->mask;
|
||||||
const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
|
const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
|
||||||
atom->nlocal;
|
atom->nlocal;
|
||||||
|
|
||||||
if (nlocal > _nlocal_max) {
|
if (nlocal > _nlocal_max) {
|
||||||
@ -211,27 +211,27 @@ void FixNVEAsphereIntel::reset_dt() {
|
|||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
if (mask[i] & groupbit) {
|
if (mask[i] & groupbit) {
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
double *shape = bonus[ellipsoid[i]].shape;
|
double *shape = bonus[ellipsoid[i]].shape;
|
||||||
double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]);
|
double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]);
|
||||||
if (idot != 0.0) idot = 1.0 / idot;
|
if (idot != 0.0) idot = 1.0 / idot;
|
||||||
_inertia0[i] = idot;
|
_inertia0[i] = idot;
|
||||||
idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]);
|
idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]);
|
||||||
if (idot != 0.0) idot = 1.0 / idot;
|
if (idot != 0.0) idot = 1.0 / idot;
|
||||||
_inertia1[i] = idot;
|
_inertia1[i] = idot;
|
||||||
idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]);
|
idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]);
|
||||||
if (idot != 0.0) idot = 1.0 / idot;
|
if (idot != 0.0) idot = 1.0 / idot;
|
||||||
_inertia2[i] = idot;
|
_inertia2[i] = idot;
|
||||||
} else {
|
} else {
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
double FixNVEAsphereIntel::memory_usage()
|
double FixNVEAsphereIntel::memory_usage()
|
||||||
{
|
{
|
||||||
return FixNVE::memory_usage() + _nlocal_max * 12 * sizeof(double);
|
return FixNVE::memory_usage() + _nlocal_max * 12 * sizeof(double);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -29,7 +29,7 @@ using namespace FixConst;
|
|||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
FixNVEIntel::FixNVEIntel(LAMMPS *lmp, int narg, char **arg) :
|
FixNVEIntel::FixNVEIntel(LAMMPS *lmp, int narg, char **arg) :
|
||||||
FixNVE(lmp, narg, arg)
|
FixNVE(lmp, narg, arg)
|
||||||
{
|
{
|
||||||
_dtfm = 0;
|
_dtfm = 0;
|
||||||
_nlocal3 = 0;
|
_nlocal3 = 0;
|
||||||
@ -91,7 +91,7 @@ void FixNVEIntel::initial_integrate(int vflag)
|
|||||||
for (int i = 0; i < _nlocal3; i++) {
|
for (int i = 0; i < _nlocal3; i++) {
|
||||||
if (_dtfm[i] != 0.0) {
|
if (_dtfm[i] != 0.0) {
|
||||||
v[i] += _dtfm[i] * f[i];
|
v[i] += _dtfm[i] * f[i];
|
||||||
x[i] += dtv * v[i];
|
x[i] += dtv * v[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -130,7 +130,7 @@ void FixNVEIntel::reset_dt() {
|
|||||||
dtf = 0.5 * update->dt * force->ftm2v;
|
dtf = 0.5 * update->dt * force->ftm2v;
|
||||||
|
|
||||||
const int * const mask = atom->mask;
|
const int * const mask = atom->mask;
|
||||||
const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
|
const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
|
||||||
atom->nlocal;
|
atom->nlocal;
|
||||||
|
|
||||||
if (nlocal > _nlocal_max) {
|
if (nlocal > _nlocal_max) {
|
||||||
@ -146,9 +146,9 @@ void FixNVEIntel::reset_dt() {
|
|||||||
const double * const rmass = atom->rmass;
|
const double * const rmass = atom->rmass;
|
||||||
int n = 0;
|
int n = 0;
|
||||||
for (int i = 0; i < nlocal; i++) {
|
for (int i = 0; i < nlocal; i++) {
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const double * const mass = atom->mass;
|
const double * const mass = atom->mass;
|
||||||
@ -165,34 +165,34 @@ void FixNVEIntel::reset_dt() {
|
|||||||
const double * const rmass = atom->rmass;
|
const double * const rmass = atom->rmass;
|
||||||
int n = 0;
|
int n = 0;
|
||||||
for (int i = 0; i < nlocal; i++)
|
for (int i = 0; i < nlocal; i++)
|
||||||
if (mask[i] & groupbit) {
|
if (mask[i] & groupbit) {
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
_dtfm[n++] = dtf / rmass[i];
|
_dtfm[n++] = dtf / rmass[i];
|
||||||
} else {
|
} else {
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const double * const mass = atom->mass;
|
const double * const mass = atom->mass;
|
||||||
const int * const type = atom->type;
|
const int * const type = atom->type;
|
||||||
int n = 0;
|
int n = 0;
|
||||||
for (int i = 0; i < nlocal; i++)
|
for (int i = 0; i < nlocal; i++)
|
||||||
if (mask[i] & groupbit) {
|
if (mask[i] & groupbit) {
|
||||||
_dtfm[n++] = dtf / mass[type[i]];
|
_dtfm[n++] = dtf / mass[type[i]];
|
||||||
_dtfm[n++] = dtf / mass[type[i]];
|
_dtfm[n++] = dtf / mass[type[i]];
|
||||||
_dtfm[n++] = dtf / mass[type[i]];
|
_dtfm[n++] = dtf / mass[type[i]];
|
||||||
} else {
|
} else {
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
_dtfm[n++] = 0.0;
|
_dtfm[n++] = 0.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
double FixNVEIntel::memory_usage()
|
double FixNVEIntel::memory_usage()
|
||||||
{
|
{
|
||||||
return FixNVE::memory_usage() + _nlocal_max * 3 * sizeof(double);
|
return FixNVE::memory_usage() + _nlocal_max * 3 * sizeof(double);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -42,7 +42,7 @@ typedef struct { int a,b,c,d,t; } int5_t;
|
|||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
ImproperCvffIntel::ImproperCvffIntel(LAMMPS *lmp) :
|
ImproperCvffIntel::ImproperCvffIntel(LAMMPS *lmp) :
|
||||||
ImproperCvff(lmp)
|
ImproperCvff(lmp)
|
||||||
{
|
{
|
||||||
suffix_flag |= Suffix::INTEL;
|
suffix_flag |= Suffix::INTEL;
|
||||||
@ -80,8 +80,8 @@ void ImproperCvffIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void ImproperCvffIntel::compute(int eflag, int vflag,
|
void ImproperCvffIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) ev_setup(eflag,vflag);
|
if (eflag || vflag) ev_setup(eflag,vflag);
|
||||||
else evflag = 0;
|
else evflag = 0;
|
||||||
@ -89,14 +89,14 @@ void ImproperCvffIntel::compute(int eflag, int vflag,
|
|||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (vflag && !eflag) {
|
if (vflag && !eflag) {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<0,1,1>(vflag, buffers, fc);
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<0,1,0>(vflag, buffers, fc);
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -109,9 +109,9 @@ void ImproperCvffIntel::compute(int eflag, int vflag,
|
|||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void ImproperCvffIntel::eval(const int vflag,
|
void ImproperCvffIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
const int inum = neighbor->nimproperlist;
|
const int inum = neighbor->nimproperlist;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@ -153,7 +153,7 @@ void ImproperCvffIntel::eval(const int vflag,
|
|||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
const int5_t * _noalias const improperlist =
|
const int5_t * _noalias const improperlist =
|
||||||
(int5_t *) neighbor->improperlist[0];
|
(int5_t *) neighbor->improperlist[0];
|
||||||
|
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
||||||
@ -230,22 +230,22 @@ void ImproperCvffIntel::eval(const int vflag,
|
|||||||
#ifndef LMP_INTEL_USE_SIMDOFF_FIX
|
#ifndef LMP_INTEL_USE_SIMDOFF_FIX
|
||||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||||
int me;
|
int me;
|
||||||
MPI_Comm_rank(world,&me);
|
MPI_Comm_rank(world,&me);
|
||||||
if (screen) {
|
if (screen) {
|
||||||
char str[128];
|
char str[128];
|
||||||
sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
|
sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
|
||||||
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
||||||
TAGINT_FORMAT " " TAGINT_FORMAT,
|
TAGINT_FORMAT " " TAGINT_FORMAT,
|
||||||
me,update->ntimestep,
|
me,update->ntimestep,
|
||||||
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
||||||
error->warning(FLERR,str,0);
|
error->warning(FLERR,str,0);
|
||||||
fprintf(screen," 1st atom: %d %g %g %g\n",
|
fprintf(screen," 1st atom: %d %g %g %g\n",
|
||||||
me,x[i1].x,x[i1].y,x[i1].z);
|
me,x[i1].x,x[i1].y,x[i1].z);
|
||||||
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
||||||
me,x[i2].x,x[i2].y,x[i2].z);
|
me,x[i2].x,x[i2].y,x[i2].z);
|
||||||
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
||||||
me,x[i3].x,x[i3].y,x[i3].z);
|
me,x[i3].x,x[i3].y,x[i3].z);
|
||||||
fprintf(screen," 4th atom: %d %g %g %g\n",
|
fprintf(screen," 4th atom: %d %g %g %g\n",
|
||||||
me,x[i4].x,x[i4].y,x[i4].z);
|
me,x[i4].x,x[i4].y,x[i4].z);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -268,35 +268,35 @@ void ImproperCvffIntel::eval(const int vflag,
|
|||||||
{
|
{
|
||||||
if (m == 2) {
|
if (m == 2) {
|
||||||
p = (flt_t)2.0*c*c;
|
p = (flt_t)2.0*c*c;
|
||||||
pd = (flt_t)2.0*c;
|
pd = (flt_t)2.0*c;
|
||||||
} else if (m == 3) {
|
} else if (m == 3) {
|
||||||
const flt_t rc2 = c*c;
|
const flt_t rc2 = c*c;
|
||||||
p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0;
|
p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0;
|
||||||
pd = (flt_t)6.0*rc2 - (flt_t)1.5;
|
pd = (flt_t)6.0*rc2 - (flt_t)1.5;
|
||||||
} else if (m == 4) {
|
} else if (m == 4) {
|
||||||
const flt_t rc2 = c*c;
|
const flt_t rc2 = c*c;
|
||||||
p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0;
|
p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0;
|
||||||
pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c;
|
pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c;
|
||||||
} else if (m == 6) {
|
} else if (m == 6) {
|
||||||
const flt_t rc2 = c*c;
|
const flt_t rc2 = c*c;
|
||||||
p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2;
|
p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2;
|
||||||
pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c;
|
pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c;
|
||||||
} else if (m == 1) {
|
} else if (m == 1) {
|
||||||
p = c + (flt_t)1.0;
|
p = c + (flt_t)1.0;
|
||||||
pd = (flt_t)0.5;
|
pd = (flt_t)0.5;
|
||||||
} else if (m == 5) {
|
} else if (m == 5) {
|
||||||
const flt_t rc2 = c*c;
|
const flt_t rc2 = c*c;
|
||||||
p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0;
|
p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0;
|
||||||
pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5;
|
pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5;
|
||||||
} else if (m == 0) {
|
} else if (m == 0) {
|
||||||
p = (flt_t)2.0;
|
p = (flt_t)2.0;
|
||||||
pd = (flt_t)0.0;
|
pd = (flt_t)0.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fc.fc[type].sign == -1) {
|
if (fc.fc[type].sign == -1) {
|
||||||
p = (flt_t)2.0 - p;
|
p = (flt_t)2.0 - p;
|
||||||
pd = -pd;
|
pd = -pd;
|
||||||
}
|
}
|
||||||
|
|
||||||
flt_t eimproper;
|
flt_t eimproper;
|
||||||
@ -340,43 +340,43 @@ void ImproperCvffIntel::eval(const int vflag,
|
|||||||
{
|
{
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
f[i1].x += f1x;
|
f[i1].x += f1x;
|
||||||
f[i1].y += f1y;
|
f[i1].y += f1y;
|
||||||
f[i1].z += f1z;
|
f[i1].z += f1z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i2 < nlocal) {
|
if (NEWTON_BOND || i2 < nlocal) {
|
||||||
f[i2].x += f2x;
|
f[i2].x += f2x;
|
||||||
f[i2].y += f2y;
|
f[i2].y += f2y;
|
||||||
f[i2].z += f2z;
|
f[i2].z += f2z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i3 < nlocal) {
|
if (NEWTON_BOND || i3 < nlocal) {
|
||||||
f[i3].x += f3x;
|
f[i3].x += f3x;
|
||||||
f[i3].y += f3y;
|
f[i3].y += f3y;
|
||||||
f[i3].z += f3z;
|
f[i3].z += f3z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i4 < nlocal) {
|
if (NEWTON_BOND || i4 < nlocal) {
|
||||||
f[i4].x += f4x;
|
f[i4].x += f4x;
|
||||||
f[i4].y += f4y;
|
f[i4].y += f4y;
|
||||||
f[i4].z += f4z;
|
f[i4].z += f4z;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EFLAG || VFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
||||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
|
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
|
||||||
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
|
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
|
||||||
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
|
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
|
||||||
vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
|
vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
|
||||||
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
||||||
#else
|
#else
|
||||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
|
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
|
||||||
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
|
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
|
||||||
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
|
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
|
||||||
vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
|
vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
|
||||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
} // for n
|
} // for n
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
||||||
@ -390,7 +390,7 @@ void ImproperCvffIntel::eval(const int vflag,
|
|||||||
if (EFLAG) energy += oeimproper;
|
if (EFLAG) energy += oeimproper;
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||||
}
|
}
|
||||||
|
|
||||||
fix->set_reduce_flag();
|
fix->set_reduce_flag();
|
||||||
@ -428,7 +428,7 @@ void ImproperCvffIntel::init_style()
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc,
|
void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers)
|
IntelBuffers<flt_t,acc_t> *buffers)
|
||||||
{
|
{
|
||||||
const int bp1 = atom->nimpropertypes + 1;
|
const int bp1 = atom->nimpropertypes + 1;
|
||||||
fc.set_ntypes(bp1,memory);
|
fc.set_ntypes(bp1,memory);
|
||||||
@ -444,11 +444,11 @@ void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void ImproperCvffIntel::ForceConst<flt_t>::set_ntypes(const int nimproper,
|
void ImproperCvffIntel::ForceConst<flt_t>::set_ntypes(const int nimproper,
|
||||||
Memory *memory) {
|
Memory *memory) {
|
||||||
if (nimproper != _nimpropertypes) {
|
if (nimproper != _nimpropertypes) {
|
||||||
if (_nimpropertypes > 0)
|
if (_nimpropertypes > 0)
|
||||||
_memory->destroy(fc);
|
_memory->destroy(fc);
|
||||||
|
|
||||||
if (nimproper > 0)
|
if (nimproper > 0)
|
||||||
_memory->create(fc,nimproper,"improperharmonicintel.fc");
|
_memory->create(fc,nimproper,"improperharmonicintel.fc");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -45,8 +45,8 @@ class ImproperCvffIntel : public ImproperCvff {
|
|||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t, acc_t> *buffers);
|
IntelBuffers<flt_t, acc_t> *buffers);
|
||||||
|
|||||||
@ -43,7 +43,7 @@ typedef struct { int a,b,c,d,t; } int5_t;
|
|||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) :
|
ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) :
|
||||||
ImproperHarmonic(lmp)
|
ImproperHarmonic(lmp)
|
||||||
{
|
{
|
||||||
suffix_flag |= Suffix::INTEL;
|
suffix_flag |= Suffix::INTEL;
|
||||||
@ -81,8 +81,8 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void ImproperHarmonicIntel::compute(int eflag, int vflag,
|
void ImproperHarmonicIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) ev_setup(eflag,vflag);
|
if (eflag || vflag) ev_setup(eflag,vflag);
|
||||||
else evflag = 0;
|
else evflag = 0;
|
||||||
@ -90,14 +90,14 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag,
|
|||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (vflag && !eflag) {
|
if (vflag && !eflag) {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<0,1,1>(vflag, buffers, fc);
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<0,1,0>(vflag, buffers, fc);
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -110,9 +110,9 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag,
|
|||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void ImproperHarmonicIntel::eval(const int vflag,
|
void ImproperHarmonicIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
const int inum = neighbor->nimproperlist;
|
const int inum = neighbor->nimproperlist;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@ -154,7 +154,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
|
|||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
const int5_t * _noalias const improperlist =
|
const int5_t * _noalias const improperlist =
|
||||||
(int5_t *) neighbor->improperlist[0];
|
(int5_t *) neighbor->improperlist[0];
|
||||||
|
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
@ -221,22 +221,22 @@ void ImproperHarmonicIntel::eval(const int vflag,
|
|||||||
#ifndef LMP_INTEL_USE_SIMDOFF
|
#ifndef LMP_INTEL_USE_SIMDOFF
|
||||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||||
int me;
|
int me;
|
||||||
MPI_Comm_rank(world,&me);
|
MPI_Comm_rank(world,&me);
|
||||||
if (screen) {
|
if (screen) {
|
||||||
char str[128];
|
char str[128];
|
||||||
sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
|
sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
|
||||||
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
||||||
TAGINT_FORMAT " " TAGINT_FORMAT,
|
TAGINT_FORMAT " " TAGINT_FORMAT,
|
||||||
me,update->ntimestep,
|
me,update->ntimestep,
|
||||||
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
||||||
error->warning(FLERR,str,0);
|
error->warning(FLERR,str,0);
|
||||||
fprintf(screen," 1st atom: %d %g %g %g\n",
|
fprintf(screen," 1st atom: %d %g %g %g\n",
|
||||||
me,x[i1].x,x[i1].y,x[i1].z);
|
me,x[i1].x,x[i1].y,x[i1].z);
|
||||||
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
||||||
me,x[i2].x,x[i2].y,x[i2].z);
|
me,x[i2].x,x[i2].y,x[i2].z);
|
||||||
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
||||||
me,x[i3].x,x[i3].y,x[i3].z);
|
me,x[i3].x,x[i3].y,x[i3].z);
|
||||||
fprintf(screen," 4th atom: %d %g %g %g\n",
|
fprintf(screen," 4th atom: %d %g %g %g\n",
|
||||||
me,x[i4].x,x[i4].y,x[i4].z);
|
me,x[i4].x,x[i4].y,x[i4].z);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -296,43 +296,43 @@ void ImproperHarmonicIntel::eval(const int vflag,
|
|||||||
{
|
{
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
f[i1].x += f1x;
|
f[i1].x += f1x;
|
||||||
f[i1].y += f1y;
|
f[i1].y += f1y;
|
||||||
f[i1].z += f1z;
|
f[i1].z += f1z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i2 < nlocal) {
|
if (NEWTON_BOND || i2 < nlocal) {
|
||||||
f[i2].x += f2x;
|
f[i2].x += f2x;
|
||||||
f[i2].y += f2y;
|
f[i2].y += f2y;
|
||||||
f[i2].z += f2z;
|
f[i2].z += f2z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i3 < nlocal) {
|
if (NEWTON_BOND || i3 < nlocal) {
|
||||||
f[i3].x += f3x;
|
f[i3].x += f3x;
|
||||||
f[i3].y += f3y;
|
f[i3].y += f3y;
|
||||||
f[i3].z += f3z;
|
f[i3].z += f3z;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i4 < nlocal) {
|
if (NEWTON_BOND || i4 < nlocal) {
|
||||||
f[i4].x += f4x;
|
f[i4].x += f4x;
|
||||||
f[i4].y += f4y;
|
f[i4].y += f4y;
|
||||||
f[i4].z += f4z;
|
f[i4].z += f4z;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EFLAG || VFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
|
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
|
||||||
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
|
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
|
||||||
f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
|
f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
|
||||||
vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
|
vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
|
||||||
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
||||||
#else
|
#else
|
||||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
|
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
|
||||||
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
|
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
|
||||||
f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
|
f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
|
||||||
vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
|
vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
|
||||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
} // for n
|
} // for n
|
||||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
@ -346,7 +346,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
|
|||||||
if (EFLAG) energy += oeimproper;
|
if (EFLAG) energy += oeimproper;
|
||||||
if (VFLAG && vflag) {
|
if (VFLAG && vflag) {
|
||||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||||
}
|
}
|
||||||
|
|
||||||
fix->set_reduce_flag();
|
fix->set_reduce_flag();
|
||||||
@ -384,7 +384,7 @@ void ImproperHarmonicIntel::init_style()
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
|
void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers)
|
IntelBuffers<flt_t,acc_t> *buffers)
|
||||||
{
|
{
|
||||||
const int bp1 = atom->nimpropertypes + 1;
|
const int bp1 = atom->nimpropertypes + 1;
|
||||||
fc.set_ntypes(bp1,memory);
|
fc.set_ntypes(bp1,memory);
|
||||||
@ -399,11 +399,11 @@ void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void ImproperHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nimproper,
|
void ImproperHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nimproper,
|
||||||
Memory *memory) {
|
Memory *memory) {
|
||||||
if (nimproper != _nimpropertypes) {
|
if (nimproper != _nimpropertypes) {
|
||||||
if (_nimpropertypes > 0)
|
if (_nimpropertypes > 0)
|
||||||
_memory->destroy(fc);
|
_memory->destroy(fc);
|
||||||
|
|
||||||
if (nimproper > 0)
|
if (nimproper > 0)
|
||||||
_memory->create(fc,nimproper,"improperharmonicintel.fc");
|
_memory->create(fc,nimproper,"improperharmonicintel.fc");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -45,8 +45,8 @@ class ImproperHarmonicIntel : public ImproperHarmonic {
|
|||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t, acc_t> *buffers);
|
IntelBuffers<flt_t, acc_t> *buffers);
|
||||||
|
|||||||
@ -71,8 +71,8 @@ void IntelBuffers<flt_t, acc_t>::free_buffers()
|
|||||||
if (ev_global != 0) {
|
if (ev_global != 0) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(x:alloc_if(0) free_if(1)) \
|
nocopy(x:alloc_if(0) free_if(1)) \
|
||||||
nocopy(f_start:alloc_if(0) free_if(1)) \
|
nocopy(f_start:alloc_if(0) free_if(1)) \
|
||||||
nocopy(ev_global:alloc_if(0) free_if(1))
|
nocopy(ev_global:alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
|
|
||||||
if (q != 0) {
|
if (q != 0) {
|
||||||
@ -105,8 +105,8 @@ void IntelBuffers<flt_t, acc_t>::free_buffers()
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
|
void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
|
||||||
const int nthreads,
|
const int nthreads,
|
||||||
const int offload_end)
|
const int offload_end)
|
||||||
{
|
{
|
||||||
free_buffers();
|
free_buffers();
|
||||||
_buf_size = static_cast<double>(nall) * 1.1 + 1;
|
_buf_size = static_cast<double>(nall) * 1.1 + 1;
|
||||||
@ -151,15 +151,15 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
|
|||||||
if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) {
|
if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \
|
nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \
|
||||||
nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
|
nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
|
||||||
nocopy(ev_global:length(8) alloc_if(1) free_if(0))
|
nocopy(ev_global:length(8) alloc_if(1) free_if(0))
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (x != NULL && f_start != NULL && ev_global != NULL) {
|
if (x != NULL && f_start != NULL && ev_global != NULL) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \
|
nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \
|
||||||
nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
|
nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
|
||||||
nocopy(ev_global:length(8) alloc_if(1) free_if(0))
|
nocopy(ev_global:length(8) alloc_if(1) free_if(0))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (lmp->atom->ellipsoid != NULL) {
|
if (lmp->atom->ellipsoid != NULL) {
|
||||||
@ -186,7 +186,7 @@ void IntelBuffers<flt_t, acc_t>::free_nmax()
|
|||||||
if (tag != 0 && special != 0 && nspecial !=0) {
|
if (tag != 0 && special != 0 && nspecial !=0) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(tag:alloc_if(0) free_if(1)) \
|
nocopy(tag:alloc_if(0) free_if(1)) \
|
||||||
nocopy(special,nspecial:alloc_if(0) free_if(1))
|
nocopy(special,nspecial:alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
_off_map_nmax = 0;
|
_off_map_nmax = 0;
|
||||||
_host_nmax = 0;
|
_host_nmax = 0;
|
||||||
@ -261,7 +261,7 @@ void IntelBuffers<flt_t, acc_t>::free_list_local()
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list,
|
void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list,
|
||||||
const int offload_end)
|
const int offload_end)
|
||||||
{
|
{
|
||||||
free_list_local();
|
free_list_local();
|
||||||
int size = list->get_maxlocal();
|
int size = list->get_maxlocal();
|
||||||
@ -276,7 +276,7 @@ void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list,
|
|||||||
if (cnumneigh != 0) {
|
if (cnumneigh != 0) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
|
nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
|
||||||
nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
|
nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
|
||||||
nocopy(cnumneigh:length(size) alloc_if(1) free_if(0))
|
nocopy(cnumneigh:length(size) alloc_if(1) free_if(0))
|
||||||
}
|
}
|
||||||
_off_map_ilist = ilist;
|
_off_map_ilist = ilist;
|
||||||
@ -309,14 +309,14 @@ void IntelBuffers<flt_t, acc_t>::free_nbor_list()
|
|||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
|
void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
|
||||||
const int nlocal,
|
const int nlocal,
|
||||||
const int nthreads,
|
const int nthreads,
|
||||||
const int offload_end,
|
const int offload_end,
|
||||||
const int pack_width)
|
const int pack_width)
|
||||||
{
|
{
|
||||||
free_nbor_list();
|
free_nbor_list();
|
||||||
_list_alloc_atoms = 1.10 * nlocal;
|
_list_alloc_atoms = 1.10 * nlocal;
|
||||||
int nt = MAX(nthreads, _off_threads);
|
int nt = MAX(nthreads, _off_threads);
|
||||||
int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) *
|
int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) *
|
||||||
get_max_nbors();
|
get_max_nbors();
|
||||||
lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
|
lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
@ -380,8 +380,8 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
||||||
const int nthreads,
|
const int nthreads,
|
||||||
const int width)
|
const int width)
|
||||||
{
|
{
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (_ccachex && off_flag && _off_ccache == 0)
|
if (_ccachex && off_flag && _off_ccache == 0)
|
||||||
@ -418,7 +418,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
|||||||
int *ccachej = _ccachej;
|
int *ccachej = _ccachej;
|
||||||
|
|
||||||
if (ccachex != NULL && ccachey !=NULL && ccachez != NULL &&
|
if (ccachex != NULL && ccachey !=NULL && ccachez != NULL &&
|
||||||
ccachew != NULL && ccachei != NULL && ccachej !=NULL) {
|
ccachew != NULL && ccachei != NULL && ccachej !=NULL) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
|
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
|
||||||
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
|
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
|
||||||
@ -471,7 +471,7 @@ void IntelBuffers<flt_t, acc_t>::free_ncache()
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
|
void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
|
||||||
const int nthreads)
|
const int nthreads)
|
||||||
{
|
{
|
||||||
const int nsize = get_max_nbors() * 3;
|
const int nsize = get_max_nbors() * 3;
|
||||||
int esize = MIN(sizeof(int), sizeof(flt_t));
|
int esize = MIN(sizeof(int), sizeof(flt_t));
|
||||||
@ -507,7 +507,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
|
|||||||
int *ncachejtype = _ncachejtype;
|
int *ncachejtype = _ncachejtype;
|
||||||
|
|
||||||
if (ncachex != NULL && ncachey !=NULL && ncachez != NULL &&
|
if (ncachex != NULL && ncachey !=NULL && ncachez != NULL &&
|
||||||
ncachej != NULL && ncachejtype != NULL) {
|
ncachej != NULL && ncachejtype != NULL) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \
|
nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \
|
||||||
nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \
|
nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \
|
||||||
@ -522,9 +522,9 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
|
|||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
#ifndef _LMP_INTEL_OFFLOAD
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt,
|
void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt,
|
||||||
const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
|
const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
|
||||||
acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
|
acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
|
||||||
{
|
{
|
||||||
IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0,
|
IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0,
|
||||||
ov1, ov2, ov3, ov4, ov5);
|
ov1, ov2, ov3, ov4, ov5);
|
||||||
@ -535,13 +535,13 @@ void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt,
|
|||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
#ifndef _LMP_INTEL_OFFLOAD
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall,
|
void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall,
|
||||||
const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
|
const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
|
||||||
acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
|
acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
|
||||||
{
|
{
|
||||||
int iifrom, iito, tid;
|
int iifrom, iito, tid;
|
||||||
IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2,
|
IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2,
|
||||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@ -62,7 +62,7 @@ class IntelBuffers {
|
|||||||
|
|
||||||
void free_buffers();
|
void free_buffers();
|
||||||
void free_nmax();
|
void free_nmax();
|
||||||
inline void set_bininfo(int *atombin, int *binpacked)
|
inline void set_bininfo(int *atombin, int *binpacked)
|
||||||
{ _atombin = atombin; _binpacked = binpacked; }
|
{ _atombin = atombin; _binpacked = binpacked; }
|
||||||
inline void grow(const int nall, const int nlocal, const int nthreads,
|
inline void grow(const int nall, const int nlocal, const int nthreads,
|
||||||
const int offload_end) {
|
const int offload_end) {
|
||||||
@ -126,7 +126,7 @@ class IntelBuffers {
|
|||||||
|
|
||||||
inline void grow_nbor_list(NeighList *list, const int nlocal,
|
inline void grow_nbor_list(NeighList *list, const int nlocal,
|
||||||
const int nthreads, const int offload_end,
|
const int nthreads, const int offload_end,
|
||||||
const int pack_width) {
|
const int pack_width) {
|
||||||
if (nlocal > _list_alloc_atoms)
|
if (nlocal > _list_alloc_atoms)
|
||||||
_grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
|
_grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
|
||||||
}
|
}
|
||||||
@ -165,7 +165,7 @@ class IntelBuffers {
|
|||||||
inline int get_off_threads() { return _off_threads; }
|
inline int get_off_threads() { return _off_threads; }
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
inline void set_off_params(const int n, const int cop,
|
inline void set_off_params(const int n, const int cop,
|
||||||
const int separate_buffers)
|
const int separate_buffers)
|
||||||
{ _off_threads = n; _cop = cop; _separate_buffers = separate_buffers; }
|
{ _off_threads = n; _cop = cop; _separate_buffers = separate_buffers; }
|
||||||
inline vec3_acc_t * get_off_f() { return _off_f; }
|
inline vec3_acc_t * get_off_f() { return _off_f; }
|
||||||
#endif
|
#endif
|
||||||
@ -191,17 +191,17 @@ class IntelBuffers {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
#ifndef _LMP_INTEL_OFFLOAD
|
||||||
void fdotr_reduce_l5(const int lf, const int lt, const int nthreads,
|
void fdotr_reduce_l5(const int lf, const int lt, const int nthreads,
|
||||||
const int f_stride, acc_t &ov0, acc_t &ov1,
|
const int f_stride, acc_t &ov0, acc_t &ov1,
|
||||||
acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5);
|
acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5);
|
||||||
void fdotr_reduce(const int nall, const int nthreads, const int f_stride,
|
void fdotr_reduce(const int nall, const int nthreads, const int f_stride,
|
||||||
acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3,
|
acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3,
|
||||||
acc_t &ov4, acc_t &ov5);
|
acc_t &ov4, acc_t &ov5);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
inline void thr_pack_cop(const int ifrom, const int ito,
|
inline void thr_pack_cop(const int ifrom, const int ito,
|
||||||
const int offset, const bool dotype = false) {
|
const int offset, const bool dotype = false) {
|
||||||
double ** x = lmp->atom->x + offset;
|
double ** x = lmp->atom->x + offset;
|
||||||
if (dotype == false) {
|
if (dotype == false) {
|
||||||
#pragma vector nontemporal
|
#pragma vector nontemporal
|
||||||
@ -214,16 +214,16 @@ class IntelBuffers {
|
|||||||
int *type = lmp->atom->type + offset;
|
int *type = lmp->atom->type + offset;
|
||||||
#pragma vector nontemporal
|
#pragma vector nontemporal
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
_x[i].x = x[i][0];
|
_x[i].x = x[i][0];
|
||||||
_x[i].y = x[i][1];
|
_x[i].y = x[i][1];
|
||||||
_x[i].z = x[i][2];
|
_x[i].z = x[i][2];
|
||||||
_x[i].w = type[i];
|
_x[i].w = type[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void thr_pack_host(const int ifrom, const int ito,
|
inline void thr_pack_host(const int ifrom, const int ito,
|
||||||
const int offset) {
|
const int offset) {
|
||||||
double ** x = lmp->atom->x + offset;
|
double ** x = lmp->atom->x + offset;
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
_host_x[i].x = x[i][0];
|
_host_x[i].x = x[i][0];
|
||||||
@ -233,13 +233,13 @@ class IntelBuffers {
|
|||||||
}
|
}
|
||||||
|
|
||||||
inline void pack_sep_from_single(const int host_min_local,
|
inline void pack_sep_from_single(const int host_min_local,
|
||||||
const int used_local,
|
const int used_local,
|
||||||
const int host_min_ghost,
|
const int host_min_ghost,
|
||||||
const int used_ghost) {
|
const int used_ghost) {
|
||||||
memcpy(_host_x + host_min_local, _x + host_min_local,
|
memcpy(_host_x + host_min_local, _x + host_min_local,
|
||||||
used_local * sizeof(atom_t));
|
used_local * sizeof(atom_t));
|
||||||
memcpy(_host_x + host_min_local + used_local, _x + host_min_ghost,
|
memcpy(_host_x + host_min_local + used_local, _x + host_min_ghost,
|
||||||
used_ghost * sizeof(atom_t));
|
used_ghost * sizeof(atom_t));
|
||||||
int nall = used_local + used_ghost + host_min_local;
|
int nall = used_local + used_ghost + host_min_local;
|
||||||
_host_x[nall].x = INTEL_BIGP;
|
_host_x[nall].x = INTEL_BIGP;
|
||||||
_host_x[nall].y = INTEL_BIGP;
|
_host_x[nall].y = INTEL_BIGP;
|
||||||
@ -247,9 +247,9 @@ class IntelBuffers {
|
|||||||
_host_x[nall].w = 1;
|
_host_x[nall].w = 1;
|
||||||
if (lmp->atom->q != NULL) {
|
if (lmp->atom->q != NULL) {
|
||||||
memcpy(_host_q + host_min_local, _q + host_min_local,
|
memcpy(_host_q + host_min_local, _q + host_min_local,
|
||||||
used_local * sizeof(flt_t));
|
used_local * sizeof(flt_t));
|
||||||
memcpy(_host_q + host_min_local + used_local, _q + host_min_ghost,
|
memcpy(_host_q + host_min_local + used_local, _q + host_min_ghost,
|
||||||
used_ghost * sizeof(flt_t));
|
used_ghost * sizeof(flt_t));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -310,7 +310,7 @@ class IntelBuffers {
|
|||||||
_alignvar(acc_t _ev_global_host[8],64);
|
_alignvar(acc_t _ev_global_host[8],64);
|
||||||
|
|
||||||
void _grow(const int nall, const int nlocal, const int nthreads,
|
void _grow(const int nall, const int nlocal, const int nthreads,
|
||||||
const int offload_end);
|
const int offload_end);
|
||||||
void _grow_nmax(const int offload_end);
|
void _grow_nmax(const int offload_end);
|
||||||
void _grow_list_local(NeighList *list, const int offload_end);
|
void _grow_list_local(NeighList *list, const int offload_end);
|
||||||
void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,
|
void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,
|
||||||
|
|||||||
@ -46,23 +46,23 @@ struct lmp_intel_an_fvec {
|
|||||||
lmp_intel_an_fvec(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; }
|
lmp_intel_an_fvec(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; }
|
||||||
lmp_intel_an_fvec& operator =(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; return *this; }
|
lmp_intel_an_fvec& operator =(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; return *this; }
|
||||||
const lmp_intel_an_fvec operator +(const lmp_intel_an_fvec &b) const {
|
const lmp_intel_an_fvec operator +(const lmp_intel_an_fvec &b) const {
|
||||||
lmp_intel_an_fvec ret = *this;
|
lmp_intel_an_fvec ret = *this;
|
||||||
ret.data[:] += b.data[:];
|
ret.data[:] += b.data[:];
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
const lmp_intel_an_fvec operator -(const lmp_intel_an_fvec &b) const {
|
const lmp_intel_an_fvec operator -(const lmp_intel_an_fvec &b) const {
|
||||||
lmp_intel_an_fvec ret = *this;
|
lmp_intel_an_fvec ret = *this;
|
||||||
ret.data[:] -= b.data[:];
|
ret.data[:] -= b.data[:];
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
const lmp_intel_an_fvec operator *(const lmp_intel_an_fvec &b) const {
|
const lmp_intel_an_fvec operator *(const lmp_intel_an_fvec &b) const {
|
||||||
lmp_intel_an_fvec ret = *this;
|
lmp_intel_an_fvec ret = *this;
|
||||||
ret.data[:] *= b.data[:];
|
ret.data[:] *= b.data[:];
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
const lmp_intel_an_fvec operator /(const lmp_intel_an_fvec &b) const {
|
const lmp_intel_an_fvec operator /(const lmp_intel_an_fvec &b) const {
|
||||||
lmp_intel_an_fvec ret = *this;
|
lmp_intel_an_fvec ret = *this;
|
||||||
ret.data[:] /= b.data[:];
|
ret.data[:] /= b.data[:];
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
lmp_intel_an_fvec& operator +=(const lmp_intel_an_fvec &b) {
|
lmp_intel_an_fvec& operator +=(const lmp_intel_an_fvec &b) {
|
||||||
@ -103,18 +103,18 @@ struct lmp_intel_an_ivec {
|
|||||||
explicit lmp_intel_an_ivec(int i) { data[:] = i; }
|
explicit lmp_intel_an_ivec(int i) { data[:] = i; }
|
||||||
explicit lmp_intel_an_ivec(const int * a) { data[:] = a[0:VL]; }
|
explicit lmp_intel_an_ivec(const int * a) { data[:] = a[0:VL]; }
|
||||||
const lmp_intel_an_ivec operator &(const lmp_intel_an_ivec &b) {
|
const lmp_intel_an_ivec operator &(const lmp_intel_an_ivec &b) {
|
||||||
lmp_intel_an_ivec ret = *this;
|
lmp_intel_an_ivec ret = *this;
|
||||||
ret.data[:] &= b.data[:];
|
ret.data[:] &= b.data[:];
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
const lmp_intel_an_ivec operator |(const lmp_intel_an_ivec &b) {
|
const lmp_intel_an_ivec operator |(const lmp_intel_an_ivec &b) {
|
||||||
lmp_intel_an_ivec ret = *this;
|
lmp_intel_an_ivec ret = *this;
|
||||||
ret.data[:] |= b.data[:];
|
ret.data[:] |= b.data[:];
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
const lmp_intel_an_ivec operator +(const lmp_intel_an_ivec &b) {
|
const lmp_intel_an_ivec operator +(const lmp_intel_an_ivec &b) {
|
||||||
lmp_intel_an_ivec ret = *this;
|
lmp_intel_an_ivec ret = *this;
|
||||||
ret.data[:] += b.data[:];
|
ret.data[:] += b.data[:];
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -171,13 +171,13 @@ enum CalculationMode { KNC, AVX, AVX2, SSE, NONE, AN };
|
|||||||
|
|
||||||
// This is used in the selection logic
|
// This is used in the selection logic
|
||||||
template<CalculationMode mode>
|
template<CalculationMode mode>
|
||||||
struct vector_traits {
|
struct vector_traits {
|
||||||
static const bool support_integer_and_gather_ops = true;
|
static const bool support_integer_and_gather_ops = true;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
struct vector_traits<AVX> {
|
struct vector_traits<AVX> {
|
||||||
static const bool support_integer_and_gather_ops = false;
|
static const bool support_integer_and_gather_ops = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
// This is the base template for all the different architectures
|
// This is the base template for all the different architectures
|
||||||
@ -198,10 +198,10 @@ struct ivec32x16 {
|
|||||||
}
|
}
|
||||||
explicit ivec32x16(int i) { vec = _mm512_set1_epi32(i); }
|
explicit ivec32x16(int i) { vec = _mm512_set1_epi32(i); }
|
||||||
operator __m512i() const { return vec; }
|
operator __m512i() const { return vec; }
|
||||||
friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) {
|
friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) {
|
||||||
return _mm512_and_epi32(a, b);
|
return _mm512_and_epi32(a, b);
|
||||||
}
|
}
|
||||||
friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) {
|
friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) {
|
||||||
return _mm512_or_epi32(a, b);
|
return _mm512_or_epi32(a, b);
|
||||||
}
|
}
|
||||||
friend ivec32x16 operator +(const ivec32x16 &a, const ivec32x16 &b) {
|
friend ivec32x16 operator +(const ivec32x16 &a, const ivec32x16 &b) {
|
||||||
@ -326,7 +326,7 @@ struct vector_ops<double, KNC> {
|
|||||||
*z = gather<1>(*z, mask, idxs, &base->z);
|
*z = gather<1>(*z, mask, idxs, &base->z);
|
||||||
*w = int_gather<1>(*w, mask, idxs, &base->w);
|
*w = int_gather<1>(*w, mask, idxs, &base->w);
|
||||||
}
|
}
|
||||||
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
||||||
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
|
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
|
||||||
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8);
|
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8);
|
||||||
@ -337,7 +337,7 @@ struct vector_ops<double, KNC> {
|
|||||||
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 48);
|
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 48);
|
||||||
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 56);
|
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 56);
|
||||||
}
|
}
|
||||||
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
||||||
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
|
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
|
||||||
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8);
|
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8);
|
||||||
@ -464,7 +464,7 @@ struct vector_ops<float, KNC> {
|
|||||||
*z = gather<1>(*z, mask, idxs, &base->z);
|
*z = gather<1>(*z, mask, idxs, &base->z);
|
||||||
*w = int_gather<1>(*w, mask, idxs, &base->w);
|
*w = int_gather<1>(*w, mask, idxs, &base->w);
|
||||||
}
|
}
|
||||||
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
||||||
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
|
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
|
||||||
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
|
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
|
||||||
@ -475,7 +475,7 @@ struct vector_ops<float, KNC> {
|
|||||||
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
|
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
|
||||||
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
|
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
|
||||||
}
|
}
|
||||||
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
||||||
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
|
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
|
||||||
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
|
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
|
||||||
@ -519,10 +519,10 @@ struct ivec32x8 {
|
|||||||
}
|
}
|
||||||
explicit ivec32x8(int i) { vec = _mm256_set1_epi32(i); }
|
explicit ivec32x8(int i) { vec = _mm256_set1_epi32(i); }
|
||||||
operator __m256i() const { return vec; }
|
operator __m256i() const { return vec; }
|
||||||
friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) {
|
friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) {
|
||||||
return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
|
return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
|
||||||
}
|
}
|
||||||
friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) {
|
friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) {
|
||||||
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
|
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
|
||||||
}
|
}
|
||||||
friend ivec32x8 operator +(const ivec32x8 &a, const ivec32x8 &b) {
|
friend ivec32x8 operator +(const ivec32x8 &a, const ivec32x8 &b) {
|
||||||
@ -545,10 +545,10 @@ struct avx_bvec {
|
|||||||
operator F64vec4() const { return _mm256_castsi256_pd(vec); }
|
operator F64vec4() const { return _mm256_castsi256_pd(vec); }
|
||||||
operator F32vec8() const { return _mm256_castsi256_ps(vec); }
|
operator F32vec8() const { return _mm256_castsi256_ps(vec); }
|
||||||
operator ivec32x8() const { return vec; }
|
operator ivec32x8() const { return vec; }
|
||||||
friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) {
|
friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) {
|
||||||
return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
|
return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
|
||||||
}
|
}
|
||||||
friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) {
|
friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) {
|
||||||
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
|
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
|
||||||
}
|
}
|
||||||
friend avx_bvec operator ~(const avx_bvec &a) { return _mm256_castpd_si256(_mm256_andnot_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(avx_bvec(0xFFFFFFFF)))); }
|
friend avx_bvec operator ~(const avx_bvec &a) { return _mm256_castpd_si256(_mm256_andnot_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(avx_bvec(0xFFFFFFFF)))); }
|
||||||
@ -582,8 +582,8 @@ struct vector_ops<double, AVX> {
|
|||||||
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
|
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
|
||||||
_mm256_store_pd(reinterpret_cast<double*>(src), from);
|
_mm256_store_pd(reinterpret_cast<double*>(src), from);
|
||||||
for (int i = 0; i < VL; i++) {
|
for (int i = 0; i < VL; i++) {
|
||||||
result[i] = mask_test_at(mask, i)
|
result[i] = mask_test_at(mask, i)
|
||||||
? *reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i])
|
? *reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i])
|
||||||
: src[i];
|
: src[i];
|
||||||
}
|
}
|
||||||
return _mm256_load_pd(reinterpret_cast<double*>(result));
|
return _mm256_load_pd(reinterpret_cast<double*>(result));
|
||||||
@ -605,18 +605,18 @@ struct vector_ops<double, AVX> {
|
|||||||
__m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
|
__m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
|
||||||
__m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
|
__m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
|
||||||
__m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
|
__m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
|
||||||
*x = blend(mask, *x, c0);
|
*x = blend(mask, *x, c0);
|
||||||
*y = blend(mask, *y, c1);
|
*y = blend(mask, *y, c1);
|
||||||
*z = blend(mask, *z, c2);
|
*z = blend(mask, *z, c2);
|
||||||
*w = int_blend(mask, *w, _mm256_castps_si256(_mm256_permute_ps(_mm256_castpd_ps(c3), 0xA0)));
|
*w = int_blend(mask, *w, _mm256_castps_si256(_mm256_permute_ps(_mm256_castpd_ps(c3), 0xA0)));
|
||||||
}
|
}
|
||||||
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
||||||
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
||||||
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
||||||
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
|
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
|
||||||
}
|
}
|
||||||
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
||||||
iarr i, m;
|
iarr i, m;
|
||||||
_mm256_store_si256(reinterpret_cast<__m256i*>(i), idxs);
|
_mm256_store_si256(reinterpret_cast<__m256i*>(i), idxs);
|
||||||
@ -642,10 +642,10 @@ struct vector_ops<double, AVX> {
|
|||||||
__m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
|
__m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
|
||||||
__m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
|
__m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
|
||||||
__m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
|
__m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
|
||||||
*r0 = blend(mask, *r0, c0);
|
*r0 = blend(mask, *r0, c0);
|
||||||
*r1 = blend(mask, *r1, c1);
|
*r1 = blend(mask, *r1, c1);
|
||||||
*r2 = blend(mask, *r2, c2);
|
*r2 = blend(mask, *r2, c2);
|
||||||
*r3 = blend(mask, *r3, c3);
|
*r3 = blend(mask, *r3, c3);
|
||||||
}
|
}
|
||||||
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
|
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
|
||||||
return (b & mask) | (a & ~ mask);
|
return (b & mask) | (a & ~ mask);
|
||||||
@ -809,8 +809,8 @@ struct vector_ops<float, AVX> {
|
|||||||
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
|
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
|
||||||
_mm256_store_ps(reinterpret_cast<float*>(src), from);
|
_mm256_store_ps(reinterpret_cast<float*>(src), from);
|
||||||
for (int i = 0; i < VL; i++) {
|
for (int i = 0; i < VL; i++) {
|
||||||
result[i] = mask_test_at(mask, i)
|
result[i] = mask_test_at(mask, i)
|
||||||
? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
|
? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
|
||||||
: src[i];
|
: src[i];
|
||||||
}
|
}
|
||||||
return _mm256_load_ps(reinterpret_cast<float*>(result));
|
return _mm256_load_ps(reinterpret_cast<float*>(result));
|
||||||
@ -842,18 +842,18 @@ struct vector_ops<float, AVX> {
|
|||||||
__m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
|
__m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
|
||||||
__m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
|
__m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
|
||||||
__m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
|
__m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
|
||||||
*x = blend(mask, *x, c0);
|
*x = blend(mask, *x, c0);
|
||||||
*y = blend(mask, *y, c1);
|
*y = blend(mask, *y, c1);
|
||||||
*z = blend(mask, *z, c2);
|
*z = blend(mask, *z, c2);
|
||||||
*w = int_blend(mask, *w, _mm256_castps_si256(c3));
|
*w = int_blend(mask, *w, _mm256_castps_si256(c3));
|
||||||
}
|
}
|
||||||
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
||||||
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
||||||
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
||||||
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7);
|
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7);
|
||||||
}
|
}
|
||||||
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
||||||
iarr i, m;
|
iarr i, m;
|
||||||
int_store(i, idxs);
|
int_store(i, idxs);
|
||||||
@ -880,10 +880,10 @@ struct vector_ops<float, AVX> {
|
|||||||
__m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
|
__m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
|
||||||
__m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
|
__m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
|
||||||
__m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
|
__m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
|
||||||
*r0 = blend(mask, *r0, c0);
|
*r0 = blend(mask, *r0, c0);
|
||||||
*r1 = blend(mask, *r1, c1);
|
*r1 = blend(mask, *r1, c1);
|
||||||
*r2 = blend(mask, *r2, c2);
|
*r2 = blend(mask, *r2, c2);
|
||||||
*r3 = blend(mask, *r3, c3);
|
*r3 = blend(mask, *r3, c3);
|
||||||
}
|
}
|
||||||
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
|
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
|
||||||
return (b & mask) | (a & ~ mask);
|
return (b & mask) | (a & ~ mask);
|
||||||
@ -961,8 +961,8 @@ struct vector_ops<float, AVX> {
|
|||||||
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
|
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
|
||||||
_mm256_store_si256(reinterpret_cast<__m256i*>(src), from);
|
_mm256_store_si256(reinterpret_cast<__m256i*>(src), from);
|
||||||
for (int i = 0; i < VL; i++) {
|
for (int i = 0; i < VL; i++) {
|
||||||
result[i] = mask_test_at(mask, i)
|
result[i] = mask_test_at(mask, i)
|
||||||
? *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
|
? *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
|
||||||
: src[i];
|
: src[i];
|
||||||
}
|
}
|
||||||
return _mm256_load_si256(reinterpret_cast<__m256i*>(result));
|
return _mm256_load_si256(reinterpret_cast<__m256i*>(result));
|
||||||
@ -1038,10 +1038,10 @@ struct avx2_ivec32 {
|
|||||||
}
|
}
|
||||||
explicit avx2_ivec32(int i) { vec = _mm256_set1_epi32(i); }
|
explicit avx2_ivec32(int i) { vec = _mm256_set1_epi32(i); }
|
||||||
operator __m256i() const { return vec; }
|
operator __m256i() const { return vec; }
|
||||||
friend avx2_ivec32 operator &(const avx2_ivec32 &a, const avx2_ivec32 &b) {
|
friend avx2_ivec32 operator &(const avx2_ivec32 &a, const avx2_ivec32 &b) {
|
||||||
return _mm256_and_si256(a, b);
|
return _mm256_and_si256(a, b);
|
||||||
}
|
}
|
||||||
friend avx2_ivec32 operator |(const avx2_ivec32 &a, const avx2_ivec32 &b) {
|
friend avx2_ivec32 operator |(const avx2_ivec32 &a, const avx2_ivec32 &b) {
|
||||||
return _mm256_or_si256(a, b);
|
return _mm256_or_si256(a, b);
|
||||||
}
|
}
|
||||||
friend avx2_ivec32 operator +(const avx2_ivec32 &a, const avx2_ivec32 &b) {
|
friend avx2_ivec32 operator +(const avx2_ivec32 &a, const avx2_ivec32 &b) {
|
||||||
@ -1060,14 +1060,14 @@ struct avx2_bvec {
|
|||||||
operator F64vec4() const { return _mm256_castsi256_pd(vec); }
|
operator F64vec4() const { return _mm256_castsi256_pd(vec); }
|
||||||
operator F32vec8() const { return _mm256_castsi256_ps(vec); }
|
operator F32vec8() const { return _mm256_castsi256_ps(vec); }
|
||||||
operator avx2_ivec32() const { return vec; }
|
operator avx2_ivec32() const { return vec; }
|
||||||
friend avx2_bvec operator &(const avx2_bvec &a, const avx2_bvec &b) {
|
friend avx2_bvec operator &(const avx2_bvec &a, const avx2_bvec &b) {
|
||||||
return _mm256_and_si256(a, b);
|
return _mm256_and_si256(a, b);
|
||||||
}
|
}
|
||||||
friend avx2_bvec operator |(const avx2_bvec &a, const avx2_bvec &b) {
|
friend avx2_bvec operator |(const avx2_bvec &a, const avx2_bvec &b) {
|
||||||
return _mm256_or_si256(a, b);
|
return _mm256_or_si256(a, b);
|
||||||
}
|
}
|
||||||
friend avx2_bvec operator ~(const avx2_bvec &a) {
|
friend avx2_bvec operator ~(const avx2_bvec &a) {
|
||||||
return _mm256_andnot_si256(a, avx2_bvec(0xFFFFFFFF));
|
return _mm256_andnot_si256(a, avx2_bvec(0xFFFFFFFF));
|
||||||
}
|
}
|
||||||
avx2_bvec& operator &=(const avx2_bvec &a) { return *this = _mm256_and_si256(vec,a); }
|
avx2_bvec& operator &=(const avx2_bvec &a) { return *this = _mm256_and_si256(vec,a); }
|
||||||
};
|
};
|
||||||
@ -1106,13 +1106,13 @@ struct vector_ops<double, AVX2> {
|
|||||||
*z = _mm256_mask_i32gather_pd(*z, &base->z, _mm256_castsi256_si128(idx1), mask, 1);
|
*z = _mm256_mask_i32gather_pd(*z, &base->z, _mm256_castsi256_si128(idx1), mask, 1);
|
||||||
*w = _mm256_mask_i32gather_epi32(*w, &base->w, idx, mask, 1);
|
*w = _mm256_mask_i32gather_epi32(*w, &base->w, idx, mask, 1);
|
||||||
}
|
}
|
||||||
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
||||||
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
||||||
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
||||||
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
|
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
|
||||||
}
|
}
|
||||||
static void gather_4(const ivec &idx, const bvec &mask, const void *base,
|
static void gather_4(const ivec &idx, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
||||||
ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
|
ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
|
||||||
ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
|
ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
|
||||||
@ -1253,7 +1253,7 @@ struct vector_ops<float, AVX2> {
|
|||||||
*z = _mm256_mask_i32gather_ps(*z, reinterpret_cast<const float*>(base) + 2, idx, mask, 1);
|
*z = _mm256_mask_i32gather_ps(*z, reinterpret_cast<const float*>(base) + 2, idx, mask, 1);
|
||||||
*w = _mm256_mask_i32gather_epi32(*w, reinterpret_cast<const int*>(base) + 3, idx, mask, 1);
|
*w = _mm256_mask_i32gather_epi32(*w, reinterpret_cast<const int*>(base) + 3, idx, mask, 1);
|
||||||
}
|
}
|
||||||
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
||||||
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
|
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
|
||||||
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
|
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
|
||||||
@ -1264,7 +1264,7 @@ struct vector_ops<float, AVX2> {
|
|||||||
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
|
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
|
||||||
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
|
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
|
||||||
}
|
}
|
||||||
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
||||||
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
|
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
|
||||||
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
|
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
|
||||||
@ -1401,10 +1401,10 @@ struct ivec32x4 {
|
|||||||
}
|
}
|
||||||
explicit ivec32x4(int i) { vec = _mm_set1_epi32(i); }
|
explicit ivec32x4(int i) { vec = _mm_set1_epi32(i); }
|
||||||
operator __m128i() const { return vec; }
|
operator __m128i() const { return vec; }
|
||||||
friend ivec32x4 operator &(const ivec32x4 &a, const ivec32x4 &b) {
|
friend ivec32x4 operator &(const ivec32x4 &a, const ivec32x4 &b) {
|
||||||
return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
|
return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
|
||||||
}
|
}
|
||||||
friend ivec32x4 operator |(const ivec32x4 &a, const ivec32x4 &b) {
|
friend ivec32x4 operator |(const ivec32x4 &a, const ivec32x4 &b) {
|
||||||
return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
|
return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
|
||||||
}
|
}
|
||||||
friend ivec32x4 operator +(const ivec32x4 &a, const ivec32x4 &b) {
|
friend ivec32x4 operator +(const ivec32x4 &a, const ivec32x4 &b) {
|
||||||
@ -1420,10 +1420,10 @@ struct sse_bvecx4 {
|
|||||||
operator __m128i() const { return vec; }
|
operator __m128i() const { return vec; }
|
||||||
operator F64vec2() const { return _mm_castsi128_pd(vec); }
|
operator F64vec2() const { return _mm_castsi128_pd(vec); }
|
||||||
operator ivec32x4() const { return vec; }
|
operator ivec32x4() const { return vec; }
|
||||||
friend sse_bvecx4 operator &(const sse_bvecx4 &a, const sse_bvecx4 &b) {
|
friend sse_bvecx4 operator &(const sse_bvecx4 &a, const sse_bvecx4 &b) {
|
||||||
return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
|
return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
|
||||||
}
|
}
|
||||||
friend sse_bvecx4 operator |(const sse_bvecx4 &a, const sse_bvecx4 &b) {
|
friend sse_bvecx4 operator |(const sse_bvecx4 &a, const sse_bvecx4 &b) {
|
||||||
return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
|
return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
|
||||||
}
|
}
|
||||||
friend sse_bvecx4 operator ~(const sse_bvecx4 &a) { return _mm_castpd_si128(_mm_andnot_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(sse_bvecx4(0xFFFFFFFF)))); }
|
friend sse_bvecx4 operator ~(const sse_bvecx4 &a) { return _mm_castpd_si128(_mm_andnot_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(sse_bvecx4(0xFFFFFFFF)))); }
|
||||||
@ -1477,18 +1477,18 @@ struct vector_ops<double, SSE> {
|
|||||||
__m128d c1 = _mm_unpackhi_pd(a0lo, a1lo);
|
__m128d c1 = _mm_unpackhi_pd(a0lo, a1lo);
|
||||||
__m128d c2 = _mm_unpacklo_pd(a0hi, a1hi);
|
__m128d c2 = _mm_unpacklo_pd(a0hi, a1hi);
|
||||||
__m128d c3 = _mm_unpackhi_pd(a0hi, a1hi);
|
__m128d c3 = _mm_unpackhi_pd(a0hi, a1hi);
|
||||||
*x = blend(mask, *x, c0);
|
*x = blend(mask, *x, c0);
|
||||||
*y = blend(mask, *y, c1);
|
*y = blend(mask, *y, c1);
|
||||||
*z = blend(mask, *z, c2);
|
*z = blend(mask, *z, c2);
|
||||||
*w = int_blend(mask, *w, _mm_shuffle_epi32(_mm_castpd_si128(c3), 0xA0));
|
*w = int_blend(mask, *w, _mm_shuffle_epi32(_mm_castpd_si128(c3), 0xA0));
|
||||||
}
|
}
|
||||||
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
||||||
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
||||||
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
||||||
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
|
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
|
||||||
}
|
}
|
||||||
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
||||||
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0);
|
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0);
|
||||||
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 8);
|
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 8);
|
||||||
@ -1634,8 +1634,8 @@ struct vector_ops<float, SSE> {
|
|||||||
_mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx);
|
_mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx);
|
||||||
_mm_store_ps(reinterpret_cast<float*>(src), from);
|
_mm_store_ps(reinterpret_cast<float*>(src), from);
|
||||||
for (int i = 0; i < VL; i++) {
|
for (int i = 0; i < VL; i++) {
|
||||||
result[i] = m[i]
|
result[i] = m[i]
|
||||||
? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
|
? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
|
||||||
: src[i];
|
: src[i];
|
||||||
}
|
}
|
||||||
return _mm_load_ps(reinterpret_cast<float*>(result));
|
return _mm_load_ps(reinterpret_cast<float*>(result));
|
||||||
@ -1647,13 +1647,13 @@ struct vector_ops<float, SSE> {
|
|||||||
*z = gather<1>(*z, mask, idxs, &base->z);
|
*z = gather<1>(*z, mask, idxs, &base->z);
|
||||||
*w = int_gather<1>(*w, mask, idxs, &base->w);
|
*w = int_gather<1>(*w, mask, idxs, &base->w);
|
||||||
}
|
}
|
||||||
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
||||||
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
||||||
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
||||||
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7);
|
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7);
|
||||||
}
|
}
|
||||||
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
||||||
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0);
|
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0);
|
||||||
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 4);
|
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 4);
|
||||||
@ -1816,13 +1816,13 @@ struct vector_ops<flt_t, NONE> {
|
|||||||
*z = gather<1>(*z, mask, idxs, &base->z);
|
*z = gather<1>(*z, mask, idxs, &base->z);
|
||||||
*w = int_gather<1>(*w, mask, idxs, &base->w);
|
*w = int_gather<1>(*w, mask, idxs, &base->w);
|
||||||
}
|
}
|
||||||
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
||||||
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
||||||
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
||||||
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7);
|
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7);
|
||||||
}
|
}
|
||||||
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
||||||
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal));
|
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal));
|
||||||
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal));
|
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal));
|
||||||
@ -1946,13 +1946,13 @@ struct vector_ops<flt_t, AN> {
|
|||||||
*z = gather<1>(*z, mask, idxs, &base->z);
|
*z = gather<1>(*z, mask, idxs, &base->z);
|
||||||
*w = int_gather<1>(*w, mask, idxs, &base->w);
|
*w = int_gather<1>(*w, mask, idxs, &base->w);
|
||||||
}
|
}
|
||||||
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
|
||||||
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
fvec a = zero(), b = zero(), c = zero(), d = zero();
|
||||||
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
gather_4(idxs, mask, base, r0, r1, r2, r3);
|
||||||
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7);
|
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7);
|
||||||
}
|
}
|
||||||
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
|
||||||
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
|
||||||
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal));
|
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal));
|
||||||
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal));
|
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal));
|
||||||
@ -2113,7 +2113,7 @@ struct AccumulatorTwiceMixin {
|
|||||||
|
|
||||||
typedef avec_t avec;
|
typedef avec_t avec;
|
||||||
typedef typename HIGH::fscal aarr[BASE::VL] __attribute__((aligned(BASE::ALIGN)));
|
typedef typename HIGH::fscal aarr[BASE::VL] __attribute__((aligned(BASE::ALIGN)));
|
||||||
|
|
||||||
static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
|
static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
|
||||||
typename HIGH::fvec blo = BASE::cvtup_lo(b);
|
typename HIGH::fvec blo = BASE::cvtup_lo(b);
|
||||||
typename HIGH::fvec bhi = BASE::cvtup_hi(b);
|
typename HIGH::fvec bhi = BASE::cvtup_hi(b);
|
||||||
@ -2121,7 +2121,7 @@ struct AccumulatorTwiceMixin {
|
|||||||
BASE::mask_cvtup(m, &mlo, &mhi);
|
BASE::mask_cvtup(m, &mlo, &mhi);
|
||||||
return avec(HIGH::mask_add(src.lo, mlo, a.lo, blo), HIGH::mask_add(src.hi, mhi, a.hi, bhi));
|
return avec(HIGH::mask_add(src.lo, mlo, a.lo, blo), HIGH::mask_add(src.hi, mhi, a.hi, bhi));
|
||||||
}
|
}
|
||||||
|
|
||||||
static typename HIGH::fscal acc_reduce_add(const avec &a) {
|
static typename HIGH::fscal acc_reduce_add(const avec &a) {
|
||||||
return HIGH::reduce_add(a.lo + a.hi);
|
return HIGH::reduce_add(a.lo + a.hi);
|
||||||
}
|
}
|
||||||
@ -2143,13 +2143,13 @@ template<class BASE_flt_t, class HIGH_flt_t, CalculationMode mic>
|
|||||||
struct AccumulatorTwiceMixinNone {
|
struct AccumulatorTwiceMixinNone {
|
||||||
typedef vector_ops<BASE_flt_t, mic> BASE;
|
typedef vector_ops<BASE_flt_t, mic> BASE;
|
||||||
typedef vector_ops<HIGH_flt_t, mic> HIGH;
|
typedef vector_ops<HIGH_flt_t, mic> HIGH;
|
||||||
|
|
||||||
typedef typename HIGH::fvec avec;
|
typedef typename HIGH::fvec avec;
|
||||||
typedef typename HIGH::fscal aarr[BASE::VL];
|
typedef typename HIGH::fscal aarr[BASE::VL];
|
||||||
|
|
||||||
static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
|
static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
|
||||||
return HIGH::mask_add(src, m, a, static_cast<typename HIGH::fvec>(b));
|
return HIGH::mask_add(src, m, a, static_cast<typename HIGH::fvec>(b));
|
||||||
}
|
}
|
||||||
static typename HIGH::fscal acc_reduce_add(const avec &a) {
|
static typename HIGH::fscal acc_reduce_add(const avec &a) {
|
||||||
return HIGH::reduce_add(a);
|
return HIGH::reduce_add(a);
|
||||||
}
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -18,110 +18,110 @@
|
|||||||
#ifndef LMP_MATH_EXTRA_INTEL_H
|
#ifndef LMP_MATH_EXTRA_INTEL_H
|
||||||
#define LMP_MATH_EXTRA_INTEL_H
|
#define LMP_MATH_EXTRA_INTEL_H
|
||||||
|
|
||||||
#define ME_quat_to_mat_trans(quat, mat) \
|
#define ME_quat_to_mat_trans(quat, mat) \
|
||||||
{ \
|
{ \
|
||||||
flt_t quat_w = quat.w; \
|
flt_t quat_w = quat.w; \
|
||||||
flt_t quat_i = quat.i; \
|
flt_t quat_i = quat.i; \
|
||||||
flt_t quat_j = quat.j; \
|
flt_t quat_j = quat.j; \
|
||||||
flt_t quat_k = quat.k; \
|
flt_t quat_k = quat.k; \
|
||||||
flt_t w2 = quat_w * quat_w; \
|
flt_t w2 = quat_w * quat_w; \
|
||||||
flt_t i2 = quat_i * quat_i; \
|
flt_t i2 = quat_i * quat_i; \
|
||||||
flt_t j2 = quat_j * quat_j; \
|
flt_t j2 = quat_j * quat_j; \
|
||||||
flt_t k2 = quat_k * quat_k; \
|
flt_t k2 = quat_k * quat_k; \
|
||||||
flt_t twoij = (flt_t)2.0 * quat_i * quat_j; \
|
flt_t twoij = (flt_t)2.0 * quat_i * quat_j; \
|
||||||
flt_t twoik = (flt_t)2.0 * quat_i * quat_k; \
|
flt_t twoik = (flt_t)2.0 * quat_i * quat_k; \
|
||||||
flt_t twojk = (flt_t)2.0 * quat_j * quat_k; \
|
flt_t twojk = (flt_t)2.0 * quat_j * quat_k; \
|
||||||
flt_t twoiw = (flt_t)2.0 * quat_i * quat_w; \
|
flt_t twoiw = (flt_t)2.0 * quat_i * quat_w; \
|
||||||
flt_t twojw = (flt_t)2.0 * quat_j * quat_w; \
|
flt_t twojw = (flt_t)2.0 * quat_j * quat_w; \
|
||||||
flt_t twokw = (flt_t)2.0 * quat_k * quat_w; \
|
flt_t twokw = (flt_t)2.0 * quat_k * quat_w; \
|
||||||
\
|
\
|
||||||
mat##_0 = w2 + i2 - j2 - k2; \
|
mat##_0 = w2 + i2 - j2 - k2; \
|
||||||
mat##_3 = twoij - twokw; \
|
mat##_3 = twoij - twokw; \
|
||||||
mat##_6 = twojw + twoik; \
|
mat##_6 = twojw + twoik; \
|
||||||
\
|
\
|
||||||
mat##_1 = twoij + twokw; \
|
mat##_1 = twoij + twokw; \
|
||||||
mat##_4 = w2 - i2 + j2 - k2; \
|
mat##_4 = w2 - i2 + j2 - k2; \
|
||||||
mat##_7 = twojk - twoiw; \
|
mat##_7 = twojk - twoiw; \
|
||||||
\
|
\
|
||||||
mat##_2 = twoik - twojw; \
|
mat##_2 = twoik - twojw; \
|
||||||
mat##_5 = twojk + twoiw; \
|
mat##_5 = twojk + twoiw; \
|
||||||
mat##_8 = w2 - i2 - j2 + k2; \
|
mat##_8 = w2 - i2 - j2 + k2; \
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
diagonal matrix times a full matrix
|
diagonal matrix times a full matrix
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#define ME_diag_times3(d, m, ans) \
|
#define ME_diag_times3(d, m, ans) \
|
||||||
{ \
|
{ \
|
||||||
ans##_0 = d[0] * m##_0; \
|
ans##_0 = d[0] * m##_0; \
|
||||||
ans##_1 = d[0] * m##_1; \
|
ans##_1 = d[0] * m##_1; \
|
||||||
ans##_2 = d[0] * m##_2; \
|
ans##_2 = d[0] * m##_2; \
|
||||||
ans##_3 = d[1] * m##_3; \
|
ans##_3 = d[1] * m##_3; \
|
||||||
ans##_4 = d[1] * m##_4; \
|
ans##_4 = d[1] * m##_4; \
|
||||||
ans##_5 = d[1] * m##_5; \
|
ans##_5 = d[1] * m##_5; \
|
||||||
ans##_6 = d[2] * m##_6; \
|
ans##_6 = d[2] * m##_6; \
|
||||||
ans##_7 = d[2] * m##_7; \
|
ans##_7 = d[2] * m##_7; \
|
||||||
ans##_8 = d[2] * m##_8; \
|
ans##_8 = d[2] * m##_8; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define ME_diag_times3a(d, m, ans) \
|
#define ME_diag_times3a(d, m, ans) \
|
||||||
{ \
|
{ \
|
||||||
ans##_0 = d##_0 * m##_0; \
|
ans##_0 = d##_0 * m##_0; \
|
||||||
ans##_1 = d##_0 * m##_1; \
|
ans##_1 = d##_0 * m##_1; \
|
||||||
ans##_2 = d##_0 * m##_2; \
|
ans##_2 = d##_0 * m##_2; \
|
||||||
ans##_3 = d##_1 * m##_3; \
|
ans##_3 = d##_1 * m##_3; \
|
||||||
ans##_4 = d##_1 * m##_4; \
|
ans##_4 = d##_1 * m##_4; \
|
||||||
ans##_5 = d##_1 * m##_5; \
|
ans##_5 = d##_1 * m##_5; \
|
||||||
ans##_6 = d##_2 * m##_6; \
|
ans##_6 = d##_2 * m##_6; \
|
||||||
ans##_7 = d##_2 * m##_7; \
|
ans##_7 = d##_2 * m##_7; \
|
||||||
ans##_8 = d##_2 * m##_8; \
|
ans##_8 = d##_2 * m##_8; \
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
multiply the transpose of mat1 times mat2
|
multiply the transpose of mat1 times mat2
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#define ME_transpose_times3(m1, m2, ans) \
|
#define ME_transpose_times3(m1, m2, ans) \
|
||||||
{ \
|
{ \
|
||||||
ans##_0 = m1##_0*m2##_0 + m1##_3*m2##_3 + m1##_6*m2##_6; \
|
ans##_0 = m1##_0*m2##_0 + m1##_3*m2##_3 + m1##_6*m2##_6; \
|
||||||
ans##_1 = m1##_0*m2##_1 + m1##_3*m2##_4 + m1##_6*m2##_7; \
|
ans##_1 = m1##_0*m2##_1 + m1##_3*m2##_4 + m1##_6*m2##_7; \
|
||||||
ans##_2 = m1##_0*m2##_2 + m1##_3*m2##_5 + m1##_6*m2##_8; \
|
ans##_2 = m1##_0*m2##_2 + m1##_3*m2##_5 + m1##_6*m2##_8; \
|
||||||
ans##_3 = m1##_1*m2##_0 + m1##_4*m2##_3 + m1##_7*m2##_6; \
|
ans##_3 = m1##_1*m2##_0 + m1##_4*m2##_3 + m1##_7*m2##_6; \
|
||||||
ans##_4 = m1##_1*m2##_1 + m1##_4*m2##_4 + m1##_7*m2##_7; \
|
ans##_4 = m1##_1*m2##_1 + m1##_4*m2##_4 + m1##_7*m2##_7; \
|
||||||
ans##_5 = m1##_1*m2##_2 + m1##_4*m2##_5 + m1##_7*m2##_8; \
|
ans##_5 = m1##_1*m2##_2 + m1##_4*m2##_5 + m1##_7*m2##_8; \
|
||||||
ans##_6 = m1##_2*m2##_0 + m1##_5*m2##_3 + m1##_8*m2##_6; \
|
ans##_6 = m1##_2*m2##_0 + m1##_5*m2##_3 + m1##_8*m2##_6; \
|
||||||
ans##_7 = m1##_2*m2##_1 + m1##_5*m2##_4 + m1##_8*m2##_7; \
|
ans##_7 = m1##_2*m2##_1 + m1##_5*m2##_4 + m1##_8*m2##_7; \
|
||||||
ans##_8 = m1##_2*m2##_2 + m1##_5*m2##_5 + m1##_8*m2##_8; \
|
ans##_8 = m1##_2*m2##_2 + m1##_5*m2##_5 + m1##_8*m2##_8; \
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
normalize a vector, return in ans
|
normalize a vector, return in ans
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#define ME_normalize3(v0, v1, v2, ans) \
|
#define ME_normalize3(v0, v1, v2, ans) \
|
||||||
{ \
|
{ \
|
||||||
flt_t scale = (flt_t)1.0 / sqrt(v0*v0+v1*v1+v2*v2); \
|
flt_t scale = (flt_t)1.0 / sqrt(v0*v0+v1*v1+v2*v2); \
|
||||||
ans##_0 = v0 * scale; \
|
ans##_0 = v0 * scale; \
|
||||||
ans##_1 = v1 * scale; \
|
ans##_1 = v1 * scale; \
|
||||||
ans##_2 = v2 * scale; \
|
ans##_2 = v2 * scale; \
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
add two matrices
|
add two matrices
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#define ME_plus3(m1, m2, ans) \
|
#define ME_plus3(m1, m2, ans) \
|
||||||
{ \
|
{ \
|
||||||
ans##_0 = m1##_0 + m2##_0; \
|
ans##_0 = m1##_0 + m2##_0; \
|
||||||
ans##_1 = m1##_1 + m2##_1; \
|
ans##_1 = m1##_1 + m2##_1; \
|
||||||
ans##_2 = m1##_2 + m2##_2; \
|
ans##_2 = m1##_2 + m2##_2; \
|
||||||
ans##_3 = m1##_3 + m2##_3; \
|
ans##_3 = m1##_3 + m2##_3; \
|
||||||
ans##_4 = m1##_4 + m2##_4; \
|
ans##_4 = m1##_4 + m2##_4; \
|
||||||
ans##_5 = m1##_5 + m2##_5; \
|
ans##_5 = m1##_5 + m2##_5; \
|
||||||
ans##_6 = m1##_6 + m2##_6; \
|
ans##_6 = m1##_6 + m2##_6; \
|
||||||
ans##_7 = m1##_7 + m2##_7; \
|
ans##_7 = m1##_7 + m2##_7; \
|
||||||
ans##_8 = m1##_8 + m2##_8; \
|
ans##_8 = m1##_8 + m2##_8; \
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
@ -135,7 +135,7 @@
|
|||||||
determinant of a matrix
|
determinant of a matrix
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#define ME_det3(m) \
|
#define ME_det3(m) \
|
||||||
( m##_0 * m##_4 * m##_8 - m##_0 * m##_5 * m##_7 - \
|
( m##_0 * m##_4 * m##_8 - m##_0 * m##_5 * m##_7 - \
|
||||||
m##_3 * m##_1 * m##_8 + m##_3 * m##_2 * m##_7 + \
|
m##_3 * m##_1 * m##_8 + m##_3 * m##_2 * m##_7 + \
|
||||||
m##_6 * m##_1 * m##_5 - m##_6 * m##_2 * m##_4 )
|
m##_6 * m##_1 * m##_5 - m##_6 * m##_2 * m##_4 )
|
||||||
@ -144,8 +144,8 @@
|
|||||||
row vector times matrix
|
row vector times matrix
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#define ME_vecmat(v, m, ans) \
|
#define ME_vecmat(v, m, ans) \
|
||||||
{ \
|
{ \
|
||||||
ans##_0 = v##_0 * m##_0 + v##_1 * m##_3 + v##_2 * m##_6; \
|
ans##_0 = v##_0 * m##_0 + v##_1 * m##_3 + v##_2 * m##_6; \
|
||||||
ans##_1 = v##_0 * m##_1 + v##_1 * m##_4 + v##_2 * m##_7; \
|
ans##_1 = v##_0 * m##_1 + v##_1 * m##_4 + v##_2 * m##_7; \
|
||||||
ans##_2 = v##_0 * m##_2 + v##_1 * m##_5 + v##_2 * m##_8; \
|
ans##_2 = v##_0 * m##_2 + v##_1 * m##_5 + v##_2 * m##_8; \
|
||||||
@ -155,214 +155,214 @@
|
|||||||
cross product of 2 vectors
|
cross product of 2 vectors
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#define ME_cross3(v1, v2, ans) \
|
#define ME_cross3(v1, v2, ans) \
|
||||||
{ \
|
{ \
|
||||||
ans##_0 = v1##_1 * v2##_2 - v1##_2 * v2##_1; \
|
ans##_0 = v1##_1 * v2##_2 - v1##_2 * v2##_1; \
|
||||||
ans##_1 = v1##_2 * v2##_0 - v1##_0 * v2##_2; \
|
ans##_1 = v1##_2 * v2##_0 - v1##_0 * v2##_2; \
|
||||||
ans##_2 = v1##_0 * v2##_1 - v1##_1 * v2##_0; \
|
ans##_2 = v1##_0 * v2##_1 - v1##_1 * v2##_0; \
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
cross product of 2 vectors
|
cross product of 2 vectors
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#define ME_mv0_cross3(m1, v2, ans) \
|
#define ME_mv0_cross3(m1, v2, ans) \
|
||||||
{ \
|
{ \
|
||||||
ans##_0 = m1##_1 * v2##_2 - m1##_2 * v2##_1; \
|
ans##_0 = m1##_1 * v2##_2 - m1##_2 * v2##_1; \
|
||||||
ans##_1 = m1##_2 * v2##_0 - m1##_0 * v2##_2; \
|
ans##_1 = m1##_2 * v2##_0 - m1##_0 * v2##_2; \
|
||||||
ans##_2 = m1##_0 * v2##_1 - m1##_1 * v2##_0; \
|
ans##_2 = m1##_0 * v2##_1 - m1##_1 * v2##_0; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define ME_mv1_cross3(m1, v2, ans) \
|
#define ME_mv1_cross3(m1, v2, ans) \
|
||||||
{ \
|
{ \
|
||||||
ans##_0 = m1##_4 * v2##_2 - m1##_5 * v2##_1; \
|
ans##_0 = m1##_4 * v2##_2 - m1##_5 * v2##_1; \
|
||||||
ans##_1 = m1##_5 * v2##_0 - m1##_3 * v2##_2; \
|
ans##_1 = m1##_5 * v2##_0 - m1##_3 * v2##_2; \
|
||||||
ans##_2 = m1##_3 * v2##_1 - m1##_4 * v2##_0; \
|
ans##_2 = m1##_3 * v2##_1 - m1##_4 * v2##_0; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define ME_mv2_cross3(m1, v2, ans) \
|
#define ME_mv2_cross3(m1, v2, ans) \
|
||||||
{ \
|
{ \
|
||||||
ans##_0 = m1##_7 * v2##_2 - m1##_8 * v2##_1; \
|
ans##_0 = m1##_7 * v2##_2 - m1##_8 * v2##_1; \
|
||||||
ans##_1 = m1##_8 * v2##_0 - m1##_6 * v2##_2; \
|
ans##_1 = m1##_8 * v2##_0 - m1##_6 * v2##_2; \
|
||||||
ans##_2 = m1##_6 * v2##_1 - m1##_7 * v2##_0; \
|
ans##_2 = m1##_6 * v2##_1 - m1##_7 * v2##_0; \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#define ME_compute_eta_torque(m1, m2, s1, ans) \
|
#define ME_compute_eta_torque(m1, m2, s1, ans) \
|
||||||
{ \
|
{ \
|
||||||
flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7- \
|
flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7- \
|
||||||
m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5- \
|
m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5- \
|
||||||
m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8; \
|
m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8; \
|
||||||
den = (flt_t)1.0 / den; \
|
den = (flt_t)1.0 / den; \
|
||||||
\
|
\
|
||||||
ans##_0 = s1##_0*(m1##_5*m1##_1*m2##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_0- \
|
ans##_0 = s1##_0*(m1##_5*m1##_1*m2##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_0- \
|
||||||
m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+ \
|
m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+ \
|
||||||
m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8- \
|
m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8- \
|
||||||
m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+ \
|
m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+ \
|
||||||
m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den; \
|
m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den; \
|
||||||
\
|
\
|
||||||
ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+ \
|
ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+ \
|
||||||
(flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5- \
|
(flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5- \
|
||||||
(flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2- \
|
(flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2- \
|
||||||
m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+ \
|
m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+ \
|
||||||
m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den; \
|
m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den; \
|
||||||
\
|
\
|
||||||
ans##_2 = s1##_0*(m1##_1*m1##_5*m2##_0-m1##_2*m2##_0*m1##_4- \
|
ans##_2 = s1##_0*(m1##_1*m1##_5*m2##_0-m1##_2*m2##_0*m1##_4- \
|
||||||
m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1- \
|
m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1- \
|
||||||
m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+ \
|
m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+ \
|
||||||
(flt_t)2.0*m1##_4*m1##_0*m2##_2- \
|
(flt_t)2.0*m1##_4*m1##_0*m2##_2- \
|
||||||
(flt_t)2.0*m1##_3*m2##_2*m1##_1+ \
|
(flt_t)2.0*m1##_3*m2##_2*m1##_1+ \
|
||||||
m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den; \
|
m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den; \
|
||||||
\
|
\
|
||||||
ans##_3 = s1##_1*(-m1##_4*m2##_5*m1##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_3+ \
|
ans##_3 = s1##_1*(-m1##_4*m2##_5*m1##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_3+ \
|
||||||
m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+ \
|
m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+ \
|
||||||
m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8- \
|
m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8- \
|
||||||
m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4- \
|
m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4- \
|
||||||
m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den; \
|
m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den; \
|
||||||
\
|
\
|
||||||
ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+ \
|
ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+ \
|
||||||
(flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5- \
|
(flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5- \
|
||||||
(flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+ \
|
(flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+ \
|
||||||
m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2- \
|
m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2- \
|
||||||
m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den; \
|
m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den; \
|
||||||
\
|
\
|
||||||
ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4- \
|
ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4- \
|
||||||
m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+ \
|
m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+ \
|
||||||
(flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+ \
|
(flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+ \
|
||||||
m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4- \
|
m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4- \
|
||||||
(flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)* \
|
(flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)* \
|
||||||
den; \
|
den; \
|
||||||
\
|
\
|
||||||
ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+ \
|
ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+ \
|
||||||
(flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+ \
|
(flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+ \
|
||||||
m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5- \
|
m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5- \
|
||||||
m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7- \
|
m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7- \
|
||||||
m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den; \
|
m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den; \
|
||||||
\
|
\
|
||||||
ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7- \
|
ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7- \
|
||||||
(flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+ \
|
(flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+ \
|
||||||
(flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8- \
|
(flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8- \
|
||||||
m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+ \
|
m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+ \
|
||||||
m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den; \
|
m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den; \
|
||||||
\
|
\
|
||||||
ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4- \
|
ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4- \
|
||||||
m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7- \
|
m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7- \
|
||||||
m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+ \
|
m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+ \
|
||||||
(flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+ \
|
(flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+ \
|
||||||
m1##_6*m1##_1*m2##_7-(flt_t)2.0*m2##_8*m1##_3*m1##_1)* \
|
m1##_6*m1##_1*m2##_7-(flt_t)2.0*m2##_8*m1##_3*m1##_1)* \
|
||||||
den; \
|
den; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define ME_vcopy4(dst,src) \
|
#define ME_vcopy4(dst,src) \
|
||||||
dst##_0 = src##_0; \
|
dst##_0 = src##_0; \
|
||||||
dst##_1 = src##_1; \
|
dst##_1 = src##_1; \
|
||||||
dst##_2 = src##_2; \
|
dst##_2 = src##_2; \
|
||||||
dst##_3 = src##_3;
|
dst##_3 = src##_3;
|
||||||
|
|
||||||
#define ME_mldivide3(m1, v_0, v_1, v_2, ans, error) \
|
#define ME_mldivide3(m1, v_0, v_1, v_2, ans, error) \
|
||||||
{ \
|
{ \
|
||||||
flt_t aug_0, aug_1, aug_2, aug_3, aug_4, aug_5; \
|
flt_t aug_0, aug_1, aug_2, aug_3, aug_4, aug_5; \
|
||||||
flt_t aug_6, aug_7, aug_8, aug_9, aug_10, aug_11, t; \
|
flt_t aug_6, aug_7, aug_8, aug_9, aug_10, aug_11, t; \
|
||||||
\
|
\
|
||||||
aug_3 = v_0; \
|
aug_3 = v_0; \
|
||||||
aug_0 = m1##_0; \
|
aug_0 = m1##_0; \
|
||||||
aug_1 = m1##_1; \
|
aug_1 = m1##_1; \
|
||||||
aug_2 = m1##_2; \
|
aug_2 = m1##_2; \
|
||||||
aug_7 = v_1; \
|
aug_7 = v_1; \
|
||||||
aug_4 = m1##_3; \
|
aug_4 = m1##_3; \
|
||||||
aug_5 = m1##_4; \
|
aug_5 = m1##_4; \
|
||||||
aug_6 = m1##_5; \
|
aug_6 = m1##_5; \
|
||||||
aug_11 = v_2; \
|
aug_11 = v_2; \
|
||||||
aug_8 = m1##_6; \
|
aug_8 = m1##_6; \
|
||||||
aug_9 = m1##_7; \
|
aug_9 = m1##_7; \
|
||||||
aug_10 = m1##_8; \
|
aug_10 = m1##_8; \
|
||||||
\
|
\
|
||||||
if (fabs(aug_4) > fabs(aug_0)) { \
|
if (fabs(aug_4) > fabs(aug_0)) { \
|
||||||
flt_t swapt; \
|
flt_t swapt; \
|
||||||
swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \
|
swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \
|
||||||
swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \
|
swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \
|
||||||
swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \
|
swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \
|
||||||
swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \
|
swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \
|
||||||
} \
|
} \
|
||||||
if (fabs(aug_8) > fabs(aug_0)) { \
|
if (fabs(aug_8) > fabs(aug_0)) { \
|
||||||
flt_t swapt; \
|
flt_t swapt; \
|
||||||
swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \
|
swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \
|
||||||
swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \
|
swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \
|
||||||
swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \
|
swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \
|
||||||
swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \
|
swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
if (aug_0 != (flt_t)0.0) { \
|
if (aug_0 != (flt_t)0.0) { \
|
||||||
} else if (aug_4 != (flt_t)0.0) { \
|
} else if (aug_4 != (flt_t)0.0) { \
|
||||||
flt_t swapt; \
|
flt_t swapt; \
|
||||||
swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \
|
swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \
|
||||||
swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \
|
swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \
|
||||||
swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \
|
swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \
|
||||||
swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \
|
swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \
|
||||||
} else if (aug_8 != (flt_t)0.0) { \
|
} else if (aug_8 != (flt_t)0.0) { \
|
||||||
flt_t swapt; \
|
flt_t swapt; \
|
||||||
swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \
|
swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \
|
||||||
swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \
|
swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \
|
||||||
swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \
|
swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \
|
||||||
swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \
|
swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \
|
||||||
} else \
|
} else \
|
||||||
error = 1; \
|
error = 1; \
|
||||||
\
|
\
|
||||||
t = aug_4 / aug_0; \
|
t = aug_4 / aug_0; \
|
||||||
aug_5 -= t * aug_1; \
|
aug_5 -= t * aug_1; \
|
||||||
aug_6 -= t * aug_2; \
|
aug_6 -= t * aug_2; \
|
||||||
aug_7 -= t * aug_3; \
|
aug_7 -= t * aug_3; \
|
||||||
t = aug_8 / aug_0; \
|
t = aug_8 / aug_0; \
|
||||||
aug_9 -= t * aug_1; \
|
aug_9 -= t * aug_1; \
|
||||||
aug_10 -= t * aug_2; \
|
aug_10 -= t * aug_2; \
|
||||||
aug_11 -= t * aug_3; \
|
aug_11 -= t * aug_3; \
|
||||||
\
|
\
|
||||||
if (fabs(aug_9) > fabs(aug_5)) { \
|
if (fabs(aug_9) > fabs(aug_5)) { \
|
||||||
flt_t swapt; \
|
flt_t swapt; \
|
||||||
swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \
|
|
||||||
swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \
|
|
||||||
swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \
|
|
||||||
swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
if (aug_5 != (flt_t)0.0) { \
|
|
||||||
} else if (aug_9 != (flt_t)0.0) { \
|
|
||||||
flt_t swapt; \
|
|
||||||
swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \
|
swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \
|
||||||
swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \
|
swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \
|
||||||
swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \
|
swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \
|
||||||
swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \
|
swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
t = aug_9 / aug_5; \
|
if (aug_5 != (flt_t)0.0) { \
|
||||||
aug_10 -= t * aug_6; \
|
} else if (aug_9 != (flt_t)0.0) { \
|
||||||
aug_11 -= t * aug_7; \
|
flt_t swapt; \
|
||||||
\
|
swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \
|
||||||
if (aug_10 == (flt_t)0.0) \
|
swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \
|
||||||
error = 1; \
|
swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \
|
||||||
\
|
swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \
|
||||||
ans##_2 = aug_11/aug_10; \
|
} \
|
||||||
t = (flt_t)0.0; \
|
\
|
||||||
t += aug_6 * ans##_2; \
|
t = aug_9 / aug_5; \
|
||||||
ans##_1 = (aug_7-t) / aug_5; \
|
aug_10 -= t * aug_6; \
|
||||||
t = (flt_t)0.0; \
|
aug_11 -= t * aug_7; \
|
||||||
t += aug_1 * ans##_1; \
|
\
|
||||||
t += aug_2 * ans##_2; \
|
if (aug_10 == (flt_t)0.0) \
|
||||||
ans##_0 = (aug_3 - t) / aug_0; \
|
error = 1; \
|
||||||
|
\
|
||||||
|
ans##_2 = aug_11/aug_10; \
|
||||||
|
t = (flt_t)0.0; \
|
||||||
|
t += aug_6 * ans##_2; \
|
||||||
|
ans##_1 = (aug_7-t) / aug_5; \
|
||||||
|
t = (flt_t)0.0; \
|
||||||
|
t += aug_1 * ans##_1; \
|
||||||
|
t += aug_2 * ans##_2; \
|
||||||
|
ans##_0 = (aug_3 - t) / aug_0; \
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
normalize a quaternion
|
normalize a quaternion
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#define ME_qnormalize(q) \
|
#define ME_qnormalize(q) \
|
||||||
{ \
|
{ \
|
||||||
double norm = 1.0 / \
|
double norm = 1.0 / \
|
||||||
sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k); \
|
sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k); \
|
||||||
q##_w *= norm; \
|
q##_w *= norm; \
|
||||||
q##_i *= norm; \
|
q##_i *= norm; \
|
||||||
q##_j *= norm; \
|
q##_j *= norm; \
|
||||||
q##_k *= norm; \
|
q##_k *= norm; \
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
@ -373,106 +373,106 @@
|
|||||||
and divide by principal moments
|
and divide by principal moments
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w) \
|
#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w) \
|
||||||
{ \
|
{ \
|
||||||
double wbody_0, wbody_1, wbody_2; \
|
double wbody_0, wbody_1, wbody_2; \
|
||||||
double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \
|
double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \
|
||||||
\
|
\
|
||||||
double w2 = quat##_w * quat##_w; \
|
double w2 = quat##_w * quat##_w; \
|
||||||
double i2 = quat##_i * quat##_i; \
|
double i2 = quat##_i * quat##_i; \
|
||||||
double j2 = quat##_j * quat##_j; \
|
double j2 = quat##_j * quat##_j; \
|
||||||
double k2 = quat##_k * quat##_k; \
|
double k2 = quat##_k * quat##_k; \
|
||||||
double twoij = 2.0 * quat##_i * quat##_j; \
|
double twoij = 2.0 * quat##_i * quat##_j; \
|
||||||
double twoik = 2.0 * quat##_i * quat##_k; \
|
double twoik = 2.0 * quat##_i * quat##_k; \
|
||||||
double twojk = 2.0 * quat##_j * quat##_k; \
|
double twojk = 2.0 * quat##_j * quat##_k; \
|
||||||
double twoiw = 2.0 * quat##_i * quat##_w; \
|
double twoiw = 2.0 * quat##_i * quat##_w; \
|
||||||
double twojw = 2.0 * quat##_j * quat##_w; \
|
double twojw = 2.0 * quat##_j * quat##_w; \
|
||||||
double twokw = 2.0 * quat##_k * quat##_w; \
|
double twokw = 2.0 * quat##_k * quat##_w; \
|
||||||
\
|
\
|
||||||
rot##_0 = w2 + i2 - j2 - k2; \
|
rot##_0 = w2 + i2 - j2 - k2; \
|
||||||
rot##_1 = twoij - twokw; \
|
rot##_1 = twoij - twokw; \
|
||||||
rot##_2 = twojw + twoik; \
|
rot##_2 = twojw + twoik; \
|
||||||
\
|
\
|
||||||
rot##_3 = twoij + twokw; \
|
rot##_3 = twoij + twokw; \
|
||||||
rot##_4 = w2 - i2 + j2 - k2; \
|
rot##_4 = w2 - i2 + j2 - k2; \
|
||||||
rot##_5 = twojk - twoiw; \
|
rot##_5 = twojk - twoiw; \
|
||||||
\
|
\
|
||||||
rot##_6 = twoik - twojw; \
|
rot##_6 = twoik - twojw; \
|
||||||
rot##_7 = twojk + twoiw; \
|
rot##_7 = twojk + twoiw; \
|
||||||
rot##_8 = w2 - i2 - j2 + k2; \
|
rot##_8 = w2 - i2 - j2 + k2; \
|
||||||
\
|
\
|
||||||
wbody_0 = rot##_0*m##_0 + rot##_3*m##_1 + rot##_6*m##_2; \
|
wbody_0 = rot##_0*m##_0 + rot##_3*m##_1 + rot##_6*m##_2; \
|
||||||
wbody_1 = rot##_1*m##_0 + rot##_4*m##_1 + rot##_7*m##_2; \
|
wbody_1 = rot##_1*m##_0 + rot##_4*m##_1 + rot##_7*m##_2; \
|
||||||
wbody_2 = rot##_2*m##_0 + rot##_5*m##_1 + rot##_8*m##_2; \
|
wbody_2 = rot##_2*m##_0 + rot##_5*m##_1 + rot##_8*m##_2; \
|
||||||
\
|
\
|
||||||
wbody_0 *= moments_0; \
|
wbody_0 *= moments_0; \
|
||||||
wbody_1 *= moments_1; \
|
wbody_1 *= moments_1; \
|
||||||
wbody_2 *= moments_2; \
|
wbody_2 *= moments_2; \
|
||||||
\
|
\
|
||||||
w##_0 = rot##_0*wbody_0 + rot##_1*wbody_1 + rot##_2*wbody_2; \
|
w##_0 = rot##_0*wbody_0 + rot##_1*wbody_1 + rot##_2*wbody_2; \
|
||||||
w##_1 = rot##_3*wbody_0 + rot##_4*wbody_1 + rot##_5*wbody_2; \
|
w##_1 = rot##_3*wbody_0 + rot##_4*wbody_1 + rot##_5*wbody_2; \
|
||||||
w##_2 = rot##_6*wbody_0 + rot##_7*wbody_1 + rot##_8*wbody_2; \
|
w##_2 = rot##_6*wbody_0 + rot##_7*wbody_1 + rot##_8*wbody_2; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2) \
|
#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2) \
|
||||||
{ \
|
{ \
|
||||||
angmomin[0] += dtf * torque[0]; \
|
angmomin[0] += dtf * torque[0]; \
|
||||||
double angmom_0 = angmomin[0]; \
|
double angmom_0 = angmomin[0]; \
|
||||||
angmomin[1] += dtf * torque[1]; \
|
angmomin[1] += dtf * torque[1]; \
|
||||||
double angmom_1 = angmomin[1]; \
|
double angmom_1 = angmomin[1]; \
|
||||||
angmomin[2] += dtf * torque[2]; \
|
angmomin[2] += dtf * torque[2]; \
|
||||||
double angmom_2 = angmomin[2]; \
|
double angmom_2 = angmomin[2]; \
|
||||||
\
|
\
|
||||||
double quat_w = quatin[0]; \
|
double quat_w = quatin[0]; \
|
||||||
double quat_i = quatin[1]; \
|
double quat_i = quatin[1]; \
|
||||||
double quat_j = quatin[2]; \
|
double quat_j = quatin[2]; \
|
||||||
double quat_k = quatin[3]; \
|
double quat_k = quatin[3]; \
|
||||||
\
|
\
|
||||||
double omega_0, omega_1, omega_2; \
|
double omega_0, omega_1, omega_2; \
|
||||||
ME_mq_to_omega(angmom,quat,i0,i1,i2,omega); \
|
ME_mq_to_omega(angmom,quat,i0,i1,i2,omega); \
|
||||||
\
|
\
|
||||||
double wq_0, wq_1, wq_2, wq_3; \
|
double wq_0, wq_1, wq_2, wq_3; \
|
||||||
wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k; \
|
wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k; \
|
||||||
wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j; \
|
wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j; \
|
||||||
wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k; \
|
wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k; \
|
||||||
wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i; \
|
wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i; \
|
||||||
\
|
\
|
||||||
double qfull_w, qfull_i, qfull_j, qfull_k; \
|
double qfull_w, qfull_i, qfull_j, qfull_k; \
|
||||||
qfull_w = quat_w + dtq * wq_0; \
|
qfull_w = quat_w + dtq * wq_0; \
|
||||||
qfull_i = quat_i + dtq * wq_1; \
|
qfull_i = quat_i + dtq * wq_1; \
|
||||||
qfull_j = quat_j + dtq * wq_2; \
|
qfull_j = quat_j + dtq * wq_2; \
|
||||||
qfull_k = quat_k + dtq * wq_3; \
|
qfull_k = quat_k + dtq * wq_3; \
|
||||||
ME_qnormalize(qfull); \
|
ME_qnormalize(qfull); \
|
||||||
\
|
\
|
||||||
double qhalf_w, qhalf_i, qhalf_j, qhalf_k; \
|
double qhalf_w, qhalf_i, qhalf_j, qhalf_k; \
|
||||||
qhalf_w = quat_w + 0.5*dtq * wq_0; \
|
qhalf_w = quat_w + 0.5*dtq * wq_0; \
|
||||||
qhalf_i = quat_i + 0.5*dtq * wq_1; \
|
qhalf_i = quat_i + 0.5*dtq * wq_1; \
|
||||||
qhalf_j = quat_j + 0.5*dtq * wq_2; \
|
qhalf_j = quat_j + 0.5*dtq * wq_2; \
|
||||||
qhalf_k = quat_k + 0.5*dtq * wq_3; \
|
qhalf_k = quat_k + 0.5*dtq * wq_3; \
|
||||||
ME_qnormalize(qhalf); \
|
ME_qnormalize(qhalf); \
|
||||||
\
|
\
|
||||||
ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega); \
|
ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega); \
|
||||||
wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k; \
|
wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k; \
|
||||||
wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j; \
|
wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j; \
|
||||||
wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k; \
|
wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k; \
|
||||||
wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i; \
|
wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i; \
|
||||||
\
|
\
|
||||||
qhalf_w += 0.5*dtq * wq_0; \
|
qhalf_w += 0.5*dtq * wq_0; \
|
||||||
qhalf_i += 0.5*dtq * wq_1; \
|
qhalf_i += 0.5*dtq * wq_1; \
|
||||||
qhalf_j += 0.5*dtq * wq_2; \
|
qhalf_j += 0.5*dtq * wq_2; \
|
||||||
qhalf_k += 0.5*dtq * wq_3; \
|
qhalf_k += 0.5*dtq * wq_3; \
|
||||||
ME_qnormalize(qhalf); \
|
ME_qnormalize(qhalf); \
|
||||||
\
|
\
|
||||||
quat_w = 2.0*qhalf_w - qfull_w; \
|
quat_w = 2.0*qhalf_w - qfull_w; \
|
||||||
quat_i = 2.0*qhalf_i - qfull_i; \
|
quat_i = 2.0*qhalf_i - qfull_i; \
|
||||||
quat_j = 2.0*qhalf_j - qfull_j; \
|
quat_j = 2.0*qhalf_j - qfull_j; \
|
||||||
quat_k = 2.0*qhalf_k - qfull_k; \
|
quat_k = 2.0*qhalf_k - qfull_k; \
|
||||||
ME_qnormalize(quat); \
|
ME_qnormalize(quat); \
|
||||||
\
|
\
|
||||||
quatin[0] = quat_w; \
|
quatin[0] = quat_w; \
|
||||||
quatin[1] = quat_i; \
|
quatin[1] = quat_i; \
|
||||||
quatin[2] = quat_j; \
|
quatin[2] = quat_j; \
|
||||||
quatin[3] = quat_k; \
|
quatin[3] = quat_k; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -51,11 +51,11 @@ NBinIntel::~NBinIntel() {
|
|||||||
const int * bins = this->bins;
|
const int * bins = this->bins;
|
||||||
const int * _atombin = this->_atombin;
|
const int * _atombin = this->_atombin;
|
||||||
const int * _binpacked = this->_binpacked;
|
const int * _binpacked = this->_binpacked;
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(binhead,bins,_atombin,_binpacked:alloc_if(0) free_if(1))
|
nocopy(binhead,bins,_atombin,_binpacked:alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
setup for bin_atoms()
|
setup for bin_atoms()
|
||||||
@ -70,8 +70,8 @@ void NBinIntel::bin_atoms_setup(int nall)
|
|||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (_offload_alloc) {
|
if (_offload_alloc) {
|
||||||
const int * binhead = this->binhead;
|
const int * binhead = this->binhead;
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(binhead:alloc_if(0) free_if(1))
|
nocopy(binhead:alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -98,8 +98,8 @@ void NBinIntel::bin_atoms_setup(int nall)
|
|||||||
const int * bins = this->bins;
|
const int * bins = this->bins;
|
||||||
const int * _atombin = this->_atombin;
|
const int * _atombin = this->_atombin;
|
||||||
const int * _binpacked = this->_binpacked;
|
const int * _binpacked = this->_binpacked;
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1))
|
nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
memory->destroy(bins);
|
memory->destroy(bins);
|
||||||
@ -157,10 +157,10 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
|
|||||||
const flt_t dx = (INTEL_BIGP - bboxhi[0]);
|
const flt_t dx = (INTEL_BIGP - bboxhi[0]);
|
||||||
const flt_t dy = (INTEL_BIGP - bboxhi[1]);
|
const flt_t dy = (INTEL_BIGP - bboxhi[1]);
|
||||||
const flt_t dz = (INTEL_BIGP - bboxhi[2]);
|
const flt_t dz = (INTEL_BIGP - bboxhi[2]);
|
||||||
if (dx * dx + dy * dy + dz * dz <
|
if (dx * dx + dy * dy + dz * dz <
|
||||||
static_cast<flt_t>(neighbor->cutneighmaxsq))
|
static_cast<flt_t>(neighbor->cutneighmaxsq))
|
||||||
error->one(FLERR,
|
error->one(FLERR,
|
||||||
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
|
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------- Grow and cast/pack buffers -------------
|
// ---------- Grow and cast/pack buffers -------------
|
||||||
@ -183,7 +183,7 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
|
|||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
|
||||||
sizeof(ATOM_T));
|
sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom, ito, 0);
|
buffers->thr_pack(ifrom, ito, 0);
|
||||||
}
|
}
|
||||||
_fix->stop_watch(TIME_PACK);
|
_fix->stop_watch(TIME_PACK);
|
||||||
|
|||||||
@ -70,48 +70,48 @@ fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end,
|
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end,
|
||||||
_fix->nbor_pack_width());
|
_fix->nbor_pack_width());
|
||||||
|
|
||||||
int need_ic = 0;
|
int need_ic = 0;
|
||||||
if (atom->molecular)
|
if (atom->molecular)
|
||||||
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
||||||
neighbor->cutneighmax);
|
neighbor->cutneighmax);
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (_fix->three_body_neighbor()) {
|
if (_fix->three_body_neighbor()) {
|
||||||
if (need_ic) {
|
if (need_ic) {
|
||||||
if (offload_noghost) {
|
if (offload_noghost) {
|
||||||
bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
|
bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
|
||||||
} else {
|
} else {
|
||||||
bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
|
bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (offload_noghost) {
|
if (offload_noghost) {
|
||||||
bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
|
bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
|
||||||
} else {
|
} else {
|
||||||
bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
|
bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (need_ic) {
|
if (need_ic) {
|
||||||
if (offload_noghost) {
|
if (offload_noghost) {
|
||||||
bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
|
bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
|
||||||
} else {
|
} else {
|
||||||
bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
|
bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (offload_noghost) {
|
if (offload_noghost) {
|
||||||
bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
|
bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
|
||||||
} else {
|
} else {
|
||||||
bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
|
bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
NPairStyle(full/bin/intel,
|
NPairStyle(full/bin/intel,
|
||||||
NPairFullBinIntel,
|
NPairFullBinIntel,
|
||||||
NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI |
|
NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI |
|
||||||
NP_INTEL)
|
NP_INTEL)
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
|||||||
@ -26,7 +26,7 @@ using namespace LAMMPS_NS;
|
|||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) :
|
NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) :
|
||||||
NPairIntel(lmp) {}
|
NPairIntel(lmp) {}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
@ -75,14 +75,14 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
|
|||||||
int need_ic = 0;
|
int need_ic = 0;
|
||||||
if (atom->molecular)
|
if (atom->molecular)
|
||||||
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
||||||
neighbor->cutneighmax);
|
neighbor->cutneighmax);
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (need_ic) {
|
if (need_ic) {
|
||||||
if (offload_noghost) {
|
if (offload_noghost) {
|
||||||
bin_newton<flt_t,acc_t,1,1,0,0,0>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,1,1,0,0,0>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal,
|
bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal,
|
||||||
off_end);
|
off_end);
|
||||||
} else {
|
} else {
|
||||||
bin_newton<flt_t,acc_t,0,1,0,0,0>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,0,1,0,0,0>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
|
bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
|
||||||
@ -90,7 +90,7 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
|
|||||||
} else {
|
} else {
|
||||||
if (offload_noghost) {
|
if (offload_noghost) {
|
||||||
bin_newton<flt_t,acc_t,1,0,0,0,0>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,1,0,0,0,0>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal,
|
bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal,
|
||||||
off_end);
|
off_end);
|
||||||
} else {
|
} else {
|
||||||
bin_newton<flt_t,acc_t,0,0,0,0,0>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,0,0,0,0,0>(1, list, buffers, 0, off_end);
|
||||||
@ -98,7 +98,7 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
if (need_ic)
|
if (need_ic)
|
||||||
bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
|
bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
|
||||||
else
|
else
|
||||||
bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
|
bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
|
||||||
|
|||||||
@ -26,7 +26,7 @@ using namespace LAMMPS_NS;
|
|||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) :
|
NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) :
|
||||||
NPairIntel(lmp) {}
|
NPairIntel(lmp) {}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
@ -75,14 +75,14 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
|
|||||||
int need_ic = 0;
|
int need_ic = 0;
|
||||||
if (atom->molecular)
|
if (atom->molecular)
|
||||||
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
||||||
neighbor->cutneighmax);
|
neighbor->cutneighmax);
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (need_ic) {
|
if (need_ic) {
|
||||||
if (offload_noghost) {
|
if (offload_noghost) {
|
||||||
bin_newton<flt_t,acc_t,1,1,0,1,0>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,1,1,0,1,0>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal,
|
bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal,
|
||||||
off_end);
|
off_end);
|
||||||
} else {
|
} else {
|
||||||
bin_newton<flt_t,acc_t,0,1,0,1,0>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,0,1,0,1,0>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
|
bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
|
||||||
@ -90,8 +90,8 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
|
|||||||
} else {
|
} else {
|
||||||
if (offload_noghost) {
|
if (offload_noghost) {
|
||||||
bin_newton<flt_t,acc_t,1,0,0,1,0>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,1,0,0,1,0>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal,
|
bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal,
|
||||||
off_end);
|
off_end);
|
||||||
} else {
|
} else {
|
||||||
bin_newton<flt_t,acc_t,0,0,0,1,0>(1, list, buffers, 0, off_end);
|
bin_newton<flt_t,acc_t,0,0,0,1,0>(1, list, buffers, 0, off_end);
|
||||||
bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
|
bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
|
||||||
|
|||||||
@ -40,7 +40,7 @@ NPairIntel::~NPairIntel() {
|
|||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (_off_map_stencil) {
|
if (_off_map_stencil) {
|
||||||
const int * stencil = this->stencil;
|
const int * stencil = this->stencil;
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(stencil:alloc_if(0) free_if(1))
|
nocopy(stencil:alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -49,10 +49,10 @@ NPairIntel::~NPairIntel() {
|
|||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <class flt_t, class acc_t, int offload_noghost, int need_ic,
|
template <class flt_t, class acc_t, int offload_noghost, int need_ic,
|
||||||
int FULL, int TRI, int THREE>
|
int FULL, int TRI, int THREE>
|
||||||
void NPairIntel::bin_newton(const int offload, NeighList *list,
|
void NPairIntel::bin_newton(const int offload, NeighList *list,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const int astart, const int aend,
|
const int astart, const int aend,
|
||||||
const int offload_end) {
|
const int offload_end) {
|
||||||
|
|
||||||
if (aend-astart == 0) return;
|
if (aend-astart == 0) return;
|
||||||
@ -66,7 +66,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
|||||||
if (THREE == 0 && offload) {
|
if (THREE == 0 && offload) {
|
||||||
if (INTEL_MIC_NBOR_PAD > 1)
|
if (INTEL_MIC_NBOR_PAD > 1)
|
||||||
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
if (THREE == 0 && INTEL_NBOR_PAD > 1)
|
if (THREE == 0 && INTEL_NBOR_PAD > 1)
|
||||||
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
||||||
@ -120,7 +120,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
|||||||
overflow = _fix->get_off_overflow_flag();
|
overflow = _fix->get_off_overflow_flag();
|
||||||
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
||||||
_fix->start_watch(TIME_OFFLOAD_LATENCY);
|
_fix->start_watch(TIME_OFFLOAD_LATENCY);
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
tnum = comm->nthreads;
|
tnum = comm->nthreads;
|
||||||
@ -193,8 +193,8 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
|||||||
int end = stencil[k] + 1;
|
int end = stencil[k] + 1;
|
||||||
for (int kk = k + 1; kk < nstencil; kk++) {
|
for (int kk = k + 1; kk < nstencil; kk++) {
|
||||||
if (stencil[kk-1]+1 == stencil[kk]) {
|
if (stencil[kk-1]+1 == stencil[kk]) {
|
||||||
end++;
|
end++;
|
||||||
k++;
|
k++;
|
||||||
} else break;
|
} else break;
|
||||||
}
|
}
|
||||||
binend[nstencilp] = end;
|
binend[nstencilp] = end;
|
||||||
@ -214,16 +214,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
|||||||
int tid, ifrom, ito;
|
int tid, ifrom, ito;
|
||||||
|
|
||||||
if (THREE) {
|
if (THREE) {
|
||||||
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
|
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
|
||||||
} else {
|
} else {
|
||||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
||||||
}
|
}
|
||||||
ifrom += astart;
|
ifrom += astart;
|
||||||
ito += astart;
|
ito += astart;
|
||||||
int e_ito = ito;
|
int e_ito = ito;
|
||||||
if (THREE && ito == num) {
|
if (THREE && ito == num) {
|
||||||
int imod = ito % pack_width;
|
int imod = ito % pack_width;
|
||||||
if (imod) e_ito += pack_width - imod;
|
if (imod) e_ito += pack_width - imod;
|
||||||
}
|
}
|
||||||
const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
|
const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
|
||||||
|
|
||||||
@ -251,313 +251,313 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
|||||||
// loop over all atoms in other bins in stencil, store every pair
|
// loop over all atoms in other bins in stencil, store every pair
|
||||||
int istart, icount, ncount, oldbin = -9999999, lane, max_chunk;
|
int istart, icount, ncount, oldbin = -9999999, lane, max_chunk;
|
||||||
if (THREE) {
|
if (THREE) {
|
||||||
lane = 0;
|
lane = 0;
|
||||||
max_chunk = 0;
|
max_chunk = 0;
|
||||||
}
|
}
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
const flt_t xtmp = x[i].x;
|
const flt_t xtmp = x[i].x;
|
||||||
const flt_t ytmp = x[i].y;
|
const flt_t ytmp = x[i].y;
|
||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
const int itype = x[i].w;
|
const int itype = x[i].w;
|
||||||
tagint itag;
|
tagint itag;
|
||||||
if (THREE) itag = tag[i];
|
if (THREE) itag = tag[i];
|
||||||
const int ioffset = ntypes * itype;
|
const int ioffset = ntypes * itype;
|
||||||
|
|
||||||
const int ibin = atombin[i];
|
const int ibin = atombin[i];
|
||||||
if (ibin != oldbin) {
|
if (ibin != oldbin) {
|
||||||
oldbin = ibin;
|
oldbin = ibin;
|
||||||
ncount = 0;
|
ncount = 0;
|
||||||
for (int k = 0; k < nstencilp; k++) {
|
for (int k = 0; k < nstencilp; k++) {
|
||||||
const int bstart = binhead[ibin + binstart[k]];
|
const int bstart = binhead[ibin + binstart[k]];
|
||||||
const int bend = binhead[ibin + binend[k]];
|
const int bend = binhead[ibin + binend[k]];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for (int jj = bstart; jj < bend; jj++)
|
for (int jj = bstart; jj < bend; jj++)
|
||||||
tj[ncount++] = binpacked[jj];
|
tj[ncount++] = binpacked[jj];
|
||||||
}
|
}
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for (int u = 0; u < ncount; u++) {
|
for (int u = 0; u < ncount; u++) {
|
||||||
const int j = tj[u];
|
const int j = tj[u];
|
||||||
tx[u] = x[j].x;
|
tx[u] = x[j].x;
|
||||||
ty[u] = x[j].y;
|
ty[u] = x[j].y;
|
||||||
tz[u] = x[j].z;
|
tz[u] = x[j].z;
|
||||||
tjtype[u] = x[j].w;
|
tjtype[u] = x[j].w;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (FULL == 0 || TRI == 1) {
|
if (FULL == 0 || TRI == 1) {
|
||||||
icount = 0;
|
icount = 0;
|
||||||
istart = ncount;
|
istart = ncount;
|
||||||
const int alignb = INTEL_DATA_ALIGN / sizeof(int);
|
const int alignb = INTEL_DATA_ALIGN / sizeof(int);
|
||||||
int nedge = istart % alignb;
|
int nedge = istart % alignb;
|
||||||
if (nedge) istart + (alignb - nedge);
|
if (nedge) istart + (alignb - nedge);
|
||||||
itx = tx + istart;
|
itx = tx + istart;
|
||||||
ity = ty + istart;
|
ity = ty + istart;
|
||||||
itz = tz + istart;
|
itz = tz + istart;
|
||||||
itj = tj + istart;
|
itj = tj + istart;
|
||||||
itjtype = tjtype + istart;
|
itjtype = tjtype + istart;
|
||||||
|
|
||||||
const int bstart = binhead[ibin];
|
const int bstart = binhead[ibin];
|
||||||
const int bend = binhead[ibin + 1];
|
const int bend = binhead[ibin + 1];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for (int jj = bstart; jj < bend; jj++) {
|
for (int jj = bstart; jj < bend; jj++) {
|
||||||
const int j = binpacked[jj];
|
const int j = binpacked[jj];
|
||||||
itj[icount] = j;
|
itj[icount] = j;
|
||||||
itx[icount] = x[j].x;
|
itx[icount] = x[j].x;
|
||||||
ity[icount] = x[j].y;
|
ity[icount] = x[j].y;
|
||||||
itz[icount] = x[j].z;
|
itz[icount] = x[j].z;
|
||||||
itjtype[icount] = x[j].w;
|
itjtype[icount] = x[j].w;
|
||||||
icount++;
|
icount++;
|
||||||
}
|
}
|
||||||
if (icount + istart > obound) *overflow = 1;
|
if (icount + istart > obound) *overflow = 1;
|
||||||
} else
|
} else
|
||||||
if (ncount > obound) *overflow = 1;
|
if (ncount > obound) *overflow = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------- Loop over i bin
|
// ---------------------- Loop over i bin
|
||||||
|
|
||||||
int n = 0;
|
int n = 0;
|
||||||
if (FULL == 0 || TRI == 1) {
|
if (FULL == 0 || TRI == 1) {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#endif
|
#endif
|
||||||
for (int u = 0; u < icount; u++) {
|
for (int u = 0; u < icount; u++) {
|
||||||
int addme = 1;
|
int addme = 1;
|
||||||
int j = itj[u];
|
int j = itj[u];
|
||||||
|
|
||||||
// Cutoff Check
|
// Cutoff Check
|
||||||
const flt_t delx = xtmp - itx[u];
|
const flt_t delx = xtmp - itx[u];
|
||||||
const flt_t dely = ytmp - ity[u];
|
const flt_t dely = ytmp - ity[u];
|
||||||
const flt_t delz = ztmp - itz[u];
|
const flt_t delz = ztmp - itz[u];
|
||||||
const int jtype = itjtype[u];
|
const int jtype = itjtype[u];
|
||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
|
if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
|
||||||
|
|
||||||
// i bin (half) check and offload ghost check
|
// i bin (half) check and offload ghost check
|
||||||
if (j < nlocal) {
|
if (j < nlocal) {
|
||||||
const int ijmod = (i + j) % 2;
|
const int ijmod = (i + j) % 2;
|
||||||
if (i > j) {
|
if (i > j) {
|
||||||
if (ijmod == 0) addme = 0;
|
if (ijmod == 0) addme = 0;
|
||||||
} else if (i < j) {
|
} else if (i < j) {
|
||||||
if (ijmod == 1) addme = 0;
|
if (ijmod == 1) addme = 0;
|
||||||
} else
|
} else
|
||||||
addme = 0;
|
addme = 0;
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (offload_noghost && i < offload_end) addme = 0;
|
if (offload_noghost && i < offload_end) addme = 0;
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (offload_noghost && offload) addme = 0;
|
if (offload_noghost && offload) addme = 0;
|
||||||
#endif
|
#endif
|
||||||
if (itz[u] < ztmp) addme = 0;
|
if (itz[u] < ztmp) addme = 0;
|
||||||
if (itz[u] == ztmp) {
|
if (itz[u] == ztmp) {
|
||||||
if (ity[u] < ytmp) addme = 0;
|
if (ity[u] < ytmp) addme = 0;
|
||||||
if (ity[u] == ytmp && itx[u] < xtmp) addme = 0;
|
if (ity[u] == ytmp && itx[u] < xtmp) addme = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (need_ic) {
|
if (need_ic) {
|
||||||
int no_special;
|
int no_special;
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
ominimum_image_check(no_special, delx, dely, delz);
|
||||||
if (no_special)
|
if (no_special)
|
||||||
j = -j - 1;
|
j = -j - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (addme)
|
if (addme)
|
||||||
neighptr[n++] = j;
|
neighptr[n++] = j;
|
||||||
}
|
}
|
||||||
} // if FULL==0
|
} // if FULL==0
|
||||||
|
|
||||||
// ---------------------- Loop over other bins
|
// ---------------------- Loop over other bins
|
||||||
|
|
||||||
int n2, *neighptr2;
|
int n2, *neighptr2;
|
||||||
if (THREE) {
|
if (THREE) {
|
||||||
n = pack_offset;
|
n = pack_offset;
|
||||||
n2 = pack_offset + maxnbors;
|
n2 = pack_offset + maxnbors;
|
||||||
neighptr2 = neighptr;
|
neighptr2 = neighptr;
|
||||||
}
|
}
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#endif
|
#endif
|
||||||
for (int u = 0; u < ncount; u++) {
|
for (int u = 0; u < ncount; u++) {
|
||||||
int addme = 1;
|
int addme = 1;
|
||||||
int j = tj[u];
|
int j = tj[u];
|
||||||
|
|
||||||
if (FULL)
|
if (FULL)
|
||||||
if (i == j) addme = 0;
|
if (i == j) addme = 0;
|
||||||
|
|
||||||
// Cutoff Check
|
// Cutoff Check
|
||||||
const flt_t delx = xtmp - tx[u];
|
const flt_t delx = xtmp - tx[u];
|
||||||
const flt_t dely = ytmp - ty[u];
|
const flt_t dely = ytmp - ty[u];
|
||||||
const flt_t delz = ztmp - tz[u];
|
const flt_t delz = ztmp - tz[u];
|
||||||
const int jtype = tjtype[u];
|
const int jtype = tjtype[u];
|
||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
|
if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
|
||||||
|
|
||||||
// Triclinic
|
// Triclinic
|
||||||
if (TRI) {
|
if (TRI) {
|
||||||
if (tz[u] < ztmp) addme = 0;
|
if (tz[u] < ztmp) addme = 0;
|
||||||
if (tz[u] == ztmp) {
|
if (tz[u] == ztmp) {
|
||||||
if (ty[u] < ytmp) addme = 0;
|
if (ty[u] < ytmp) addme = 0;
|
||||||
if (ty[u] == ytmp) {
|
if (ty[u] == ytmp) {
|
||||||
if (tx[u] < xtmp) addme = 0;
|
if (tx[u] < xtmp) addme = 0;
|
||||||
if (tx[u] == xtmp && j <= i) addme = 0;
|
if (tx[u] == xtmp && j <= i) addme = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// offload ghost check
|
// offload ghost check
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (offload_noghost) {
|
if (offload_noghost) {
|
||||||
if (j < nlocal) {
|
if (j < nlocal) {
|
||||||
if (i < offload_end) addme = 0;
|
if (i < offload_end) addme = 0;
|
||||||
} else if (offload) addme = 0;
|
} else if (offload) addme = 0;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int pj;
|
int pj;
|
||||||
if (THREE) pj = j;
|
if (THREE) pj = j;
|
||||||
if (need_ic) {
|
if (need_ic) {
|
||||||
int no_special;
|
int no_special;
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
ominimum_image_check(no_special, delx, dely, delz);
|
||||||
if (no_special)
|
if (no_special)
|
||||||
j = -j - 1;
|
j = -j - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (THREE) {
|
if (THREE) {
|
||||||
const int jtag = tag[pj];
|
const int jtag = tag[pj];
|
||||||
int flist = 0;
|
int flist = 0;
|
||||||
if (itag > jtag) {
|
if (itag > jtag) {
|
||||||
if ((itag+jtag) % 2 == 0) flist = 1;
|
if ((itag+jtag) % 2 == 0) flist = 1;
|
||||||
} else if (itag < jtag) {
|
} else if (itag < jtag) {
|
||||||
if ((itag+jtag) % 2 == 1) flist = 1;
|
if ((itag+jtag) % 2 == 1) flist = 1;
|
||||||
} else {
|
} else {
|
||||||
if (tz[u] < ztmp) flist = 1;
|
if (tz[u] < ztmp) flist = 1;
|
||||||
else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
|
else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
|
||||||
else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
|
else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
|
||||||
flist = 1;
|
flist = 1;
|
||||||
}
|
}
|
||||||
if (addme) {
|
if (addme) {
|
||||||
if (flist)
|
if (flist)
|
||||||
neighptr2[n2++] = j;
|
neighptr2[n2++] = j;
|
||||||
else
|
else
|
||||||
neighptr[n++] = j;
|
neighptr[n++] = j;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (addme)
|
if (addme)
|
||||||
neighptr[n++] = j;
|
neighptr[n++] = j;
|
||||||
}
|
}
|
||||||
} // for u
|
} // for u
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
#ifndef _LMP_INTEL_OFFLOAD
|
||||||
if (exclude) {
|
if (exclude) {
|
||||||
int alln = n;
|
int alln = n;
|
||||||
if (THREE) n = pack_offset;
|
if (THREE) n = pack_offset;
|
||||||
else n = 0;
|
else n = 0;
|
||||||
for (int u = pack_offset; u < alln; u++) {
|
for (int u = pack_offset; u < alln; u++) {
|
||||||
const int j = neighptr[u];
|
const int j = neighptr[u];
|
||||||
int pj = j;
|
int pj = j;
|
||||||
if (need_ic)
|
if (need_ic)
|
||||||
if (pj < 0) pj = -j - 1;
|
if (pj < 0) pj = -j - 1;
|
||||||
const int jtype = x[pj].w;
|
const int jtype = x[pj].w;
|
||||||
if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
|
if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
|
||||||
neighptr[n++] = j;
|
neighptr[n++] = j;
|
||||||
|
}
|
||||||
|
if (THREE) {
|
||||||
|
alln = n2;
|
||||||
|
n2 = pack_offset + maxnbors;
|
||||||
|
for (int u = pack_offset + maxnbors; u < alln; u++) {
|
||||||
|
const int j = neighptr[u];
|
||||||
|
int pj = j;
|
||||||
|
if (need_ic)
|
||||||
|
if (pj < 0) pj = -j - 1;
|
||||||
|
const int jtype = x[pj].w;
|
||||||
|
if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
|
||||||
|
neighptr[n2++] = j;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (THREE) {
|
|
||||||
alln = n2;
|
|
||||||
n2 = pack_offset + maxnbors;
|
|
||||||
for (int u = pack_offset + maxnbors; u < alln; u++) {
|
|
||||||
const int j = neighptr[u];
|
|
||||||
int pj = j;
|
|
||||||
if (need_ic)
|
|
||||||
if (pj < 0) pj = -j - 1;
|
|
||||||
const int jtype = x[pj].w;
|
|
||||||
if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
|
|
||||||
neighptr[n2++] = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
int ns;
|
int ns;
|
||||||
if (THREE) {
|
if (THREE) {
|
||||||
int alln = n;
|
int alln = n;
|
||||||
ns = n - pack_offset;
|
ns = n - pack_offset;
|
||||||
atombin[i] = ns;
|
atombin[i] = ns;
|
||||||
n = lane;
|
n = lane;
|
||||||
for (int u = pack_offset; u < alln; u++) {
|
for (int u = pack_offset; u < alln; u++) {
|
||||||
neighptr[n] = neighptr[u];
|
neighptr[n] = neighptr[u];
|
||||||
n += pack_width;
|
n += pack_width;
|
||||||
}
|
}
|
||||||
ns += n2 - pack_offset - maxnbors;
|
ns += n2 - pack_offset - maxnbors;
|
||||||
for (int u = pack_offset + maxnbors; u < n2; u++) {
|
for (int u = pack_offset + maxnbors; u < n2; u++) {
|
||||||
neighptr[n] = neighptr[u];
|
neighptr[n] = neighptr[u];
|
||||||
n += pack_width;
|
n += pack_width;
|
||||||
}
|
}
|
||||||
if (ns > maxnbors) *overflow = 1;
|
if (ns > maxnbors) *overflow = 1;
|
||||||
} else
|
} else
|
||||||
if (n > maxnbors) *overflow = 1;
|
if (n > maxnbors) *overflow = 1;
|
||||||
|
|
||||||
ilist[i] = i;
|
ilist[i] = i;
|
||||||
cnumneigh[i] = ct;
|
cnumneigh[i] = ct;
|
||||||
if (THREE) {
|
if (THREE) {
|
||||||
cnumneigh[i] += lane;
|
cnumneigh[i] += lane;
|
||||||
numneigh[i] = ns;
|
numneigh[i] = ns;
|
||||||
} else {
|
} else {
|
||||||
int edge = (n % pad_width);
|
int edge = (n % pad_width);
|
||||||
if (edge) {
|
if (edge) {
|
||||||
const int pad_end = n + (pad_width - edge);
|
const int pad_end = n + (pad_width - edge);
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
|
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
|
||||||
avg=INTEL_COMPILE_WIDTH/2
|
avg=INTEL_COMPILE_WIDTH/2
|
||||||
#endif
|
#endif
|
||||||
for ( ; n < pad_end; n++)
|
for ( ; n < pad_end; n++)
|
||||||
neighptr[n] = e_nall;
|
neighptr[n] = e_nall;
|
||||||
}
|
}
|
||||||
numneigh[i] = n;
|
numneigh[i] = n;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (THREE) {
|
if (THREE) {
|
||||||
if (ns > max_chunk) max_chunk = ns;
|
if (ns > max_chunk) max_chunk = ns;
|
||||||
lane++;
|
lane++;
|
||||||
if (lane == pack_width) {
|
if (lane == pack_width) {
|
||||||
ct += max_chunk * pack_width;
|
ct += max_chunk * pack_width;
|
||||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||||
const int edge = (ct % alignb);
|
const int edge = (ct % alignb);
|
||||||
if (edge) ct += alignb - edge;
|
if (edge) ct += alignb - edge;
|
||||||
neighptr = firstneigh + ct;
|
neighptr = firstneigh + ct;
|
||||||
max_chunk = 0;
|
max_chunk = 0;
|
||||||
pack_offset = maxnbors * pack_width;
|
pack_offset = maxnbors * pack_width;
|
||||||
lane = 0;
|
lane = 0;
|
||||||
if (ct + obound > list_size) {
|
if (ct + obound > list_size) {
|
||||||
if (i < ito - 1) {
|
if (i < ito - 1) {
|
||||||
*overflow = 1;
|
*overflow = 1;
|
||||||
ct = (ifrom + tid * 2) * maxnbors;
|
ct = (ifrom + tid * 2) * maxnbors;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
ct += n;
|
ct += n;
|
||||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||||
const int edge = (ct % alignb);
|
const int edge = (ct % alignb);
|
||||||
if (edge) ct += alignb - edge;
|
if (edge) ct += alignb - edge;
|
||||||
neighptr = firstneigh + ct;
|
neighptr = firstneigh + ct;
|
||||||
if (ct + obound > list_size) {
|
if (ct + obound > list_size) {
|
||||||
if (i < ito - 1) {
|
if (i < ito - 1) {
|
||||||
*overflow = 1;
|
*overflow = 1;
|
||||||
ct = (ifrom + tid * 2) * maxnbors;
|
ct = (ifrom + tid * 2) * maxnbors;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (*overflow == 1)
|
if (*overflow == 1)
|
||||||
@ -568,16 +568,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
|||||||
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
||||||
int ghost_offset = 0, nall_offset = e_nall;
|
int ghost_offset = 0, nall_offset = e_nall;
|
||||||
if (separate_buffers) {
|
if (separate_buffers) {
|
||||||
for (int i = ifrom; i < ito; ++i) {
|
for (int i = ifrom; i < ito; ++i) {
|
||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
#if __INTEL_COMPILER+0 > 1499
|
#if __INTEL_COMPILER+0 > 1499
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
int j = jlist[jj];
|
int j = jlist[jj];
|
||||||
if (need_ic && j < 0) j = -j - 1;
|
if (need_ic && j < 0) j = -j - 1;
|
||||||
if (j < nlocal) {
|
if (j < nlocal) {
|
||||||
if (j < vlmin) vlmin = j;
|
if (j < vlmin) vlmin = j;
|
||||||
if (j > vlmax) vlmax = j;
|
if (j > vlmax) vlmax = j;
|
||||||
@ -585,33 +585,33 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
|||||||
if (j < vgmin) vgmin = j;
|
if (j < vgmin) vgmin = j;
|
||||||
if (j > vgmax) vgmax = j;
|
if (j > vgmax) vgmax = j;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lmin = MIN(lmin,vlmin);
|
lmin = MIN(lmin,vlmin);
|
||||||
gmin = MIN(gmin,vgmin);
|
gmin = MIN(gmin,vgmin);
|
||||||
lmax = MAX(lmax,vlmax);
|
lmax = MAX(lmax,vlmax);
|
||||||
gmax = MAX(gmax,vgmax);
|
gmax = MAX(gmax,vgmax);
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp critical
|
#pragma omp critical
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
|
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
|
||||||
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
|
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
|
||||||
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
|
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
|
||||||
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
|
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
|
||||||
|
}
|
||||||
|
#pragma omp barrier
|
||||||
|
|
||||||
|
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
|
||||||
|
if (nghost < 0) nghost = 0;
|
||||||
|
if (offload) {
|
||||||
|
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
|
||||||
|
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
|
||||||
|
} else {
|
||||||
|
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
|
||||||
|
nall_offset = nlocal + nghost;
|
||||||
}
|
}
|
||||||
#pragma omp barrier
|
|
||||||
|
|
||||||
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
|
|
||||||
if (nghost < 0) nghost = 0;
|
|
||||||
if (offload) {
|
|
||||||
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
|
|
||||||
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
|
|
||||||
} else {
|
|
||||||
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
|
|
||||||
nall_offset = nlocal + nghost;
|
|
||||||
}
|
|
||||||
} // if separate_buffers
|
} // if separate_buffers
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -620,67 +620,67 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
|||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
|
|
||||||
if (THREE) {
|
if (THREE) {
|
||||||
const int trip = jnum * pack_width;
|
const int trip = jnum * pack_width;
|
||||||
for (int jj = 0; jj < trip; jj+=pack_width) {
|
for (int jj = 0; jj < trip; jj+=pack_width) {
|
||||||
const int j = jlist[jj];
|
const int j = jlist[jj];
|
||||||
if (need_ic && j < 0) {
|
if (need_ic && j < 0) {
|
||||||
which = 0;
|
which = 0;
|
||||||
jlist[jj] = -j - 1;
|
jlist[jj] = -j - 1;
|
||||||
} else
|
} else
|
||||||
ofind_special(which, special, nspecial, i, tag[j]);
|
ofind_special(which, special, nspecial, i, tag[j]);
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (j >= nlocal) {
|
if (j >= nlocal) {
|
||||||
if (j == e_nall)
|
if (j == e_nall)
|
||||||
jlist[jj] = nall_offset;
|
jlist[jj] = nall_offset;
|
||||||
else if (which)
|
else if (which)
|
||||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||||
else jlist[jj]-=ghost_offset;
|
else jlist[jj]-=ghost_offset;
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif
|
||||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
|
||||||
#pragma vector aligned
|
|
||||||
#pragma simd
|
|
||||||
#endif
|
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
|
||||||
const int j = jlist[jj];
|
|
||||||
if (need_ic && j < 0) {
|
|
||||||
which = 0;
|
|
||||||
jlist[jj] = -j - 1;
|
|
||||||
} else
|
|
||||||
ofind_special(which, special, nspecial, i, tag[j]);
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j >= nlocal) {
|
|
||||||
if (j == e_nall)
|
|
||||||
jlist[jj] = nall_offset;
|
|
||||||
else if (which)
|
|
||||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
|
||||||
else jlist[jj]-=ghost_offset;
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
} // for i
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
|
const int j = jlist[jj];
|
||||||
|
if (need_ic && j < 0) {
|
||||||
|
which = 0;
|
||||||
|
jlist[jj] = -j - 1;
|
||||||
|
} else
|
||||||
|
ofind_special(which, special, nspecial, i, tag[j]);
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
if (j >= nlocal) {
|
||||||
|
if (j == e_nall)
|
||||||
|
jlist[jj] = nall_offset;
|
||||||
|
else if (which)
|
||||||
|
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||||
|
else jlist[jj]-=ghost_offset;
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // for i
|
||||||
} // if molecular
|
} // if molecular
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
else if (separate_buffers) {
|
else if (separate_buffers) {
|
||||||
for (int i = ifrom; i < ito; ++i) {
|
for (int i = ifrom; i < ito; ++i) {
|
||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
int jj = 0;
|
int jj = 0;
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd
|
#pragma simd
|
||||||
for (jj = 0; jj < jnum; jj++) {
|
for (jj = 0; jj < jnum; jj++) {
|
||||||
if (jlist[jj] >= nlocal) {
|
if (jlist[jj] >= nlocal) {
|
||||||
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
|
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
|
||||||
else jlist[jj] -= ghost_offset;
|
else jlist[jj] -= ghost_offset;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // end omp
|
} // end omp
|
||||||
@ -704,9 +704,9 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
|||||||
_fix->start_watch(TIME_PACK);
|
_fix->start_watch(TIME_PACK);
|
||||||
_fix->set_neighbor_host_sizes();
|
_fix->set_neighbor_host_sizes();
|
||||||
buffers->pack_sep_from_single(_fix->host_min_local(),
|
buffers->pack_sep_from_single(_fix->host_min_local(),
|
||||||
_fix->host_used_local(),
|
_fix->host_used_local(),
|
||||||
_fix->host_min_ghost(),
|
_fix->host_min_ghost(),
|
||||||
_fix->host_used_ghost());
|
_fix->host_used_ghost());
|
||||||
_fix->stop_watch(TIME_PACK);
|
_fix->stop_watch(TIME_PACK);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -732,9 +732,9 @@ void NPairIntel::grow_stencil()
|
|||||||
_off_map_stencil = stencil;
|
_off_map_stencil = stencil;
|
||||||
const int * stencil = _off_map_stencil;
|
const int * stencil = _off_map_stencil;
|
||||||
const int maxstencil = ns->get_maxstencil();
|
const int maxstencil = ns->get_maxstencil();
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
in(stencil:length(maxstencil) alloc_if(1) free_if(0))
|
in(stencil:length(maxstencil) alloc_if(1) free_if(0))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@ -84,8 +84,8 @@ class NPairIntel : public NPair {
|
|||||||
FixIntel *_fix;
|
FixIntel *_fix;
|
||||||
|
|
||||||
template <class flt_t, class acc_t, int, int, int, int, int>
|
template <class flt_t, class acc_t, int, int, int, int, int>
|
||||||
void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *,
|
void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *,
|
||||||
const int, const int, const int offload_end = 0);
|
const int, const int, const int offload_end = 0);
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
int _cop;
|
int _cop;
|
||||||
|
|||||||
@ -55,7 +55,7 @@ PairBuckCoulCutIntel::~PairBuckCoulCutIntel()
|
|||||||
void PairBuckCoulCutIntel::compute(int eflag, int vflag)
|
void PairBuckCoulCutIntel::compute(int eflag, int vflag)
|
||||||
{
|
{
|
||||||
if (fix->precision()==FixIntel::PREC_MODE_MIXED)
|
if (fix->precision()==FixIntel::PREC_MODE_MIXED)
|
||||||
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
||||||
force_const_single);
|
force_const_single);
|
||||||
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
|
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
|
||||||
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
||||||
@ -70,8 +70,8 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void PairBuckCoulCutIntel::compute(int eflag, int vflag,
|
void PairBuckCoulCutIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) {
|
if (eflag || vflag) {
|
||||||
ev_setup(eflag,vflag);
|
ev_setup(eflag,vflag);
|
||||||
@ -94,13 +94,13 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag,
|
|||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||||
packthreads, sizeof(ATOM_T));
|
packthreads, sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom,ito,ago);
|
buffers->thr_pack(ifrom,ito,ago);
|
||||||
}
|
}
|
||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ovflag = 0;
|
int ovflag = 0;
|
||||||
if (vflag_fdotr) ovflag = 2;
|
if (vflag_fdotr) ovflag = 2;
|
||||||
else if (vflag) ovflag = 1;
|
else if (vflag) ovflag = 1;
|
||||||
@ -127,9 +127,9 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag,
|
|||||||
|
|
||||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
const int astart, const int aend)
|
const int astart, const int aend)
|
||||||
{
|
{
|
||||||
const int inum = aend - astart;
|
const int inum = aend - astart;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@ -160,8 +160,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
|||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
int tc;
|
int tc;
|
||||||
FORCE_T * _noalias f_start;
|
FORCE_T * _noalias f_start;
|
||||||
@ -198,8 +198,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
|||||||
*timer_compute = MIC_Wtime();
|
*timer_compute = MIC_Wtime();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
||||||
f_stride, x, q);
|
f_stride, x, q);
|
||||||
|
|
||||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||||
@ -233,20 +233,20 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
acc_t fxtmp,fytmp,fztmp,fwtmp;
|
acc_t fxtmp,fytmp,fztmp,fwtmp;
|
||||||
acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
|
acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
|
|
||||||
const flt_t xtmp = x[i].x;
|
const flt_t xtmp = x[i].x;
|
||||||
const flt_t ytmp = x[i].y;
|
const flt_t ytmp = x[i].y;
|
||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
const flt_t qtmp = q[i];
|
const flt_t qtmp = q[i];
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
flt_t forcecoul, forcebuck, evdwl, ecoul;
|
flt_t forcecoul, forcebuck, evdwl, ecoul;
|
||||||
@ -262,19 +262,19 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
const flt_t r = sqrt(rsq);
|
const flt_t r = sqrt(rsq);
|
||||||
const flt_t r2inv = (flt_t)1.0 / rsq;
|
const flt_t r2inv = (flt_t)1.0 / rsq;
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
if (rsq < c_cuti[jtype].cut_coulsq) {
|
if (rsq < c_cuti[jtype].cut_coulsq) {
|
||||||
#endif
|
#endif
|
||||||
forcecoul = qqrd2e * qtmp*q[j]/r;
|
forcecoul = qqrd2e * qtmp*q[j]/r;
|
||||||
if (EFLAG)
|
if (EFLAG)
|
||||||
ecoul = forcecoul;
|
ecoul = forcecoul;
|
||||||
if (sbindex){
|
if (sbindex){
|
||||||
const flt_t factor_coul = special_coul[sbindex];
|
const flt_t factor_coul = special_coul[sbindex];
|
||||||
forcecoul *= factor_coul;
|
forcecoul *= factor_coul;
|
||||||
if(EFLAG)
|
if(EFLAG)
|
||||||
ecoul *= factor_coul;
|
ecoul *= factor_coul;
|
||||||
|
|
||||||
}
|
}
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
@ -282,7 +282,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
|||||||
if (rsq >= c_cuti[jtype].cut_coulsq)
|
if (rsq >= c_cuti[jtype].cut_coulsq)
|
||||||
{ forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
|
{ forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
if (rsq < c_cuti[jtype].cut_ljsq) {
|
if (rsq < c_cuti[jtype].cut_ljsq) {
|
||||||
#endif
|
#endif
|
||||||
@ -290,14 +290,14 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
|||||||
flt_t rexp = exp(-r * c_forcei[jtype].rhoinv);
|
flt_t rexp = exp(-r * c_forcei[jtype].rhoinv);
|
||||||
forcebuck = r * rexp * c_forcei[jtype].buck1 -
|
forcebuck = r * rexp * c_forcei[jtype].buck1 -
|
||||||
r6inv * c_forcei[jtype].buck2;
|
r6inv * c_forcei[jtype].buck2;
|
||||||
if (EFLAG)
|
if (EFLAG)
|
||||||
evdwl = rexp * c_energyi[jtype].a -
|
evdwl = rexp * c_energyi[jtype].a -
|
||||||
r6inv * c_energyi[jtype].c -
|
r6inv * c_energyi[jtype].c -
|
||||||
c_energyi[jtype].offset;
|
c_energyi[jtype].offset;
|
||||||
if (sbindex) {
|
if (sbindex) {
|
||||||
const flt_t factor_lj = special_lj[sbindex];
|
const flt_t factor_lj = special_lj[sbindex];
|
||||||
forcebuck *= factor_lj;
|
forcebuck *= factor_lj;
|
||||||
if (EFLAG)
|
if (EFLAG)
|
||||||
evdwl *= factor_lj;
|
evdwl *= factor_lj;
|
||||||
}
|
}
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
@ -311,51 +311,51 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
|||||||
if (rsq < c_cuti[jtype].cutsq) {
|
if (rsq < c_cuti[jtype].cutsq) {
|
||||||
#endif
|
#endif
|
||||||
const flt_t fpair = (forcecoul + forcebuck) * r2inv;
|
const flt_t fpair = (forcecoul + forcebuck) * r2inv;
|
||||||
const flt_t fpx = fpair * delx;
|
const flt_t fpx = fpair * delx;
|
||||||
fxtmp += fpx;
|
fxtmp += fpx;
|
||||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
const flt_t fpy = fpair * dely;
|
const flt_t fpy = fpair * dely;
|
||||||
fytmp += fpy;
|
fytmp += fpy;
|
||||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||||
const flt_t fpz = fpair * delz;
|
const flt_t fpz = fpair * delz;
|
||||||
fztmp += fpz;
|
fztmp += fpz;
|
||||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
|
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
sevdwl += evdwl;
|
sevdwl += evdwl;
|
||||||
secoul += ecoul;
|
secoul += ecoul;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
if (NEWTON_PAIR)
|
if (NEWTON_PAIR)
|
||||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // for jj
|
} // for jj
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
f[i].x += fxtmp;
|
f[i].x += fxtmp;
|
||||||
f[i].y += fytmp;
|
f[i].y += fytmp;
|
||||||
f[i].z += fztmp;
|
f[i].z += fztmp;
|
||||||
} else {
|
} else {
|
||||||
f[i].x = fxtmp;
|
f[i].x = fxtmp;
|
||||||
f[i].y = fytmp;
|
f[i].y = fytmp;
|
||||||
f[i].z = fztmp;
|
f[i].z = fztmp;
|
||||||
}
|
}
|
||||||
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
ov4, ov5);
|
ov4, ov5);
|
||||||
} // end of omp parallel region
|
} // end of omp parallel region
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||||
@ -364,12 +364,12 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
|||||||
}
|
}
|
||||||
if (vflag) {
|
if (vflag) {
|
||||||
if (NEWTON_PAIR == 0) {
|
if (NEWTON_PAIR == 0) {
|
||||||
ov0 *= (acc_t)0.5;
|
ov0 *= (acc_t)0.5;
|
||||||
ov1 *= (acc_t)0.5;
|
ov1 *= (acc_t)0.5;
|
||||||
ov2 *= (acc_t)0.5;
|
ov2 *= (acc_t)0.5;
|
||||||
ov3 *= (acc_t)0.5;
|
ov3 *= (acc_t)0.5;
|
||||||
ov4 *= (acc_t)0.5;
|
ov4 *= (acc_t)0.5;
|
||||||
ov5 *= (acc_t)0.5;
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
ev_global[2] = ov0;
|
ev_global[2] = ov0;
|
||||||
ev_global[3] = ov1;
|
ev_global[3] = ov1;
|
||||||
@ -410,7 +410,7 @@ void PairBuckCoulCutIntel::init_style()
|
|||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"The 'package intel' command is required for /intel styles");
|
"The 'package intel' command is required for /intel styles");
|
||||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||||
|
|
||||||
fix->pair_init_check();
|
fix->pair_init_check();
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
_cop = fix->coprocessor_number();
|
_cop = fix->coprocessor_number();
|
||||||
@ -492,9 +492,9 @@ void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
||||||
const int ntable,
|
const int ntable,
|
||||||
Memory *memory,
|
Memory *memory,
|
||||||
const int cop) {
|
const int cop) {
|
||||||
if ( (ntypes != _ntypes || ntable != _ntable) ) {
|
if ( (ntypes != _ntypes || ntable != _ntable) ) {
|
||||||
if (_ntypes > 0) {
|
if (_ntypes > 0) {
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
@ -505,12 +505,12 @@ void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
c_cut_t * oc_cut = c_cut[0];
|
c_cut_t * oc_cut = c_cut[0];
|
||||||
|
|
||||||
if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL &&
|
if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL &&
|
||||||
oc_energy != NULL && ospecial_coul != NULL &&
|
oc_energy != NULL && ospecial_coul != NULL &&
|
||||||
_cop >= 0) {
|
_cop >= 0) {
|
||||||
#pragma offload_transfer target(mic:cop) \
|
#pragma offload_transfer target(mic:cop) \
|
||||||
nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
|
nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
|
||||||
nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
|
nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
|
||||||
nocopy(oc_cut: alloc_if(0) free_if(1))
|
nocopy(oc_cut: alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -534,7 +534,7 @@ void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
c_cut_t * oc_cut = c_cut[0];
|
c_cut_t * oc_cut = c_cut[0];
|
||||||
int tp1sq = ntypes*ntypes;
|
int tp1sq = ntypes*ntypes;
|
||||||
if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL &&
|
if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL &&
|
||||||
oc_energy != NULL && ospecial_coul != NULL &&
|
oc_energy != NULL && ospecial_coul != NULL &&
|
||||||
cop >= 0) {
|
cop >= 0) {
|
||||||
#pragma offload_transfer target(mic:cop) \
|
#pragma offload_transfer target(mic:cop) \
|
||||||
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
|
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
|
||||||
|
|||||||
@ -51,8 +51,8 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut {
|
|||||||
|
|
||||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void eval(const int offload, const int vflag,
|
void eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> * buffers,
|
IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
@ -75,7 +75,7 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut {
|
|||||||
~ForceConst() { set_ntypes(0,0,NULL,_cop); }
|
~ForceConst() { set_ntypes(0,0,NULL,_cop); }
|
||||||
|
|
||||||
void set_ntypes(const int ntypes, const int ntable, Memory *memory,
|
void set_ntypes(const int ntypes, const int ntable, Memory *memory,
|
||||||
const int cop);
|
const int cop);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int _ntypes, _ntable, _cop;
|
int _ntypes, _ntable, _cop;
|
||||||
|
|||||||
@ -55,7 +55,7 @@ PairBuckCoulLongIntel::~PairBuckCoulLongIntel()
|
|||||||
void PairBuckCoulLongIntel::compute(int eflag, int vflag)
|
void PairBuckCoulLongIntel::compute(int eflag, int vflag)
|
||||||
{
|
{
|
||||||
if (fix->precision()==FixIntel::PREC_MODE_MIXED)
|
if (fix->precision()==FixIntel::PREC_MODE_MIXED)
|
||||||
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
||||||
force_const_single);
|
force_const_single);
|
||||||
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
|
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
|
||||||
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
||||||
@ -70,8 +70,8 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void PairBuckCoulLongIntel::compute(int eflag, int vflag,
|
void PairBuckCoulLongIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) {
|
if (eflag || vflag) {
|
||||||
ev_setup(eflag,vflag);
|
ev_setup(eflag,vflag);
|
||||||
@ -85,7 +85,7 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,
|
|||||||
|
|
||||||
if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
|
if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
|
||||||
fix->start_watch(TIME_PACK);
|
fix->start_watch(TIME_PACK);
|
||||||
|
|
||||||
int packthreads;
|
int packthreads;
|
||||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||||
else packthreads = 1;
|
else packthreads = 1;
|
||||||
@ -94,13 +94,13 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,
|
|||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||||
packthreads, sizeof(ATOM_T));
|
packthreads, sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom,ito,ago);
|
buffers->thr_pack(ifrom,ito,ago);
|
||||||
}
|
}
|
||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ovflag = 0;
|
int ovflag = 0;
|
||||||
if (vflag_fdotr) ovflag = 2;
|
if (vflag_fdotr) ovflag = 2;
|
||||||
else if (vflag) ovflag = 1;
|
else if (vflag) ovflag = 1;
|
||||||
@ -127,9 +127,9 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,
|
|||||||
|
|
||||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
const int astart, const int aend)
|
const int astart, const int aend)
|
||||||
{
|
{
|
||||||
const int inum = aend - astart;
|
const int inum = aend - astart;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@ -175,8 +175,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
int tc;
|
int tc;
|
||||||
FORCE_T * _noalias f_start;
|
FORCE_T * _noalias f_start;
|
||||||
@ -213,7 +213,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
|
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
|
||||||
in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
|
in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
|
||||||
in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
|
in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
|
||||||
in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \
|
in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \
|
||||||
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
|
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
|
||||||
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
|
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
|
||||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
||||||
@ -224,8 +224,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
*timer_compute = MIC_Wtime();
|
*timer_compute = MIC_Wtime();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
||||||
f_stride, x, q);
|
f_stride, x, q);
|
||||||
|
|
||||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||||
@ -260,24 +260,24 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
const int ptr_off = itype * ntypes;
|
const int ptr_off = itype * ntypes;
|
||||||
const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
|
const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
|
||||||
const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
|
const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
|
||||||
const flt_t * _noalias const rho_invi = rho_inv + ptr_off;
|
const flt_t * _noalias const rho_invi = rho_inv + ptr_off;
|
||||||
|
|
||||||
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
|
|
||||||
acc_t fxtmp,fytmp,fztmp,fwtmp;
|
acc_t fxtmp,fytmp,fztmp,fwtmp;
|
||||||
acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
|
acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
|
|
||||||
const flt_t xtmp = x[i].x;
|
const flt_t xtmp = x[i].x;
|
||||||
const flt_t ytmp = x[i].y;
|
const flt_t ytmp = x[i].y;
|
||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
const flt_t qtmp = q[i];
|
const flt_t qtmp = q[i];
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
|
|
||||||
int ej = 0;
|
int ej = 0;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
@ -287,33 +287,33 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t delx = xtmp - x[j].x;
|
const flt_t delx = xtmp - x[j].x;
|
||||||
const flt_t dely = ytmp - x[j].y;
|
const flt_t dely = ytmp - x[j].y;
|
||||||
const flt_t delz = ztmp - x[j].z;
|
const flt_t delz = ztmp - x[j].z;
|
||||||
const int jtype = x[j].w;
|
const int jtype = x[j].w;
|
||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
|
|
||||||
if (rsq < c_forcei[jtype].cutsq) {
|
if (rsq < c_forcei[jtype].cutsq) {
|
||||||
trsq[ej]=rsq;
|
trsq[ej]=rsq;
|
||||||
tdelx[ej]=delx;
|
tdelx[ej]=delx;
|
||||||
tdely[ej]=dely;
|
tdely[ej]=dely;
|
||||||
tdelz[ej]=delz;
|
tdelz[ej]=delz;
|
||||||
tjtype[ej]=jtype;
|
tjtype[ej]=jtype;
|
||||||
tj[ej]=jlist[jj];
|
tj[ej]=jlist[jj];
|
||||||
ej++;
|
ej++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < ej; jj++) {
|
for (int jj = 0; jj < ej; jj++) {
|
||||||
flt_t forcecoul, forcebuck, evdwl, ecoul;
|
flt_t forcecoul, forcebuck, evdwl, ecoul;
|
||||||
forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
|
forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
|
||||||
|
|
||||||
const int j = tj[jj] & NEIGHMASK;
|
const int j = tj[jj] & NEIGHMASK;
|
||||||
const int sbindex = tj[jj] >> SBBITS & 3;
|
const int sbindex = tj[jj] >> SBBITS & 3;
|
||||||
const int jtype = tjtype[jj];
|
const int jtype = tjtype[jj];
|
||||||
const flt_t rsq = trsq[jj];
|
const flt_t rsq = trsq[jj];
|
||||||
const flt_t r2inv = (flt_t)1.0 / rsq;
|
const flt_t r2inv = (flt_t)1.0 / rsq;
|
||||||
const flt_t r = (flt_t)1.0 / sqrt(r2inv);
|
const flt_t r = (flt_t)1.0 / sqrt(r2inv);
|
||||||
|
|
||||||
@ -321,52 +321,52 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
if (!ncoultablebits || rsq <= tabinnersq) {
|
if (!ncoultablebits || rsq <= tabinnersq) {
|
||||||
#endif
|
#endif
|
||||||
const flt_t A1 = 0.254829592;
|
const flt_t A1 = 0.254829592;
|
||||||
const flt_t A2 = -0.284496736;
|
const flt_t A2 = -0.284496736;
|
||||||
const flt_t A3 = 1.421413741;
|
const flt_t A3 = 1.421413741;
|
||||||
const flt_t A4 = -1.453152027;
|
const flt_t A4 = -1.453152027;
|
||||||
const flt_t A5 = 1.061405429;
|
const flt_t A5 = 1.061405429;
|
||||||
const flt_t EWALD_F = 1.12837917;
|
const flt_t EWALD_F = 1.12837917;
|
||||||
const flt_t INV_EWALD_P = 1.0 / 0.3275911;
|
const flt_t INV_EWALD_P = 1.0 / 0.3275911;
|
||||||
|
|
||||||
const flt_t grij = g_ewald * r;
|
const flt_t grij = g_ewald * r;
|
||||||
const flt_t expm2 = exp(-grij * grij);
|
const flt_t expm2 = exp(-grij * grij);
|
||||||
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
|
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
|
||||||
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||||
const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
|
const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
|
||||||
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||||
if (EFLAG) ecoul = prefactor * erfc;
|
if (EFLAG) ecoul = prefactor * erfc;
|
||||||
|
|
||||||
|
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
|
||||||
|
prefactor;
|
||||||
|
forcecoul -= adjust;
|
||||||
|
if (EFLAG) ecoul -= adjust;
|
||||||
|
|
||||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
|
|
||||||
prefactor;
|
|
||||||
forcecoul -= adjust;
|
|
||||||
if (EFLAG) ecoul -= adjust;
|
|
||||||
|
|
||||||
#ifdef INTEL_ALLOW_TABLE
|
#ifdef INTEL_ALLOW_TABLE
|
||||||
} else {
|
} else {
|
||||||
float rsq_lookup = rsq;
|
float rsq_lookup = rsq;
|
||||||
const int itable = (__intel_castf32_u32(rsq_lookup) &
|
const int itable = (__intel_castf32_u32(rsq_lookup) &
|
||||||
ncoulmask) >> ncoulshiftbits;
|
ncoulmask) >> ncoulshiftbits;
|
||||||
const flt_t fraction = (rsq_lookup - table[itable].r) *
|
const flt_t fraction = (rsq_lookup - table[itable].r) *
|
||||||
table[itable].dr;
|
table[itable].dr;
|
||||||
|
|
||||||
const flt_t tablet = table[itable].f +
|
const flt_t tablet = table[itable].f +
|
||||||
fraction * table[itable].df;
|
fraction * table[itable].df;
|
||||||
forcecoul = qtmp * q[j] * tablet;
|
forcecoul = qtmp * q[j] * tablet;
|
||||||
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
|
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
|
||||||
fraction * detable[itable]);
|
fraction * detable[itable]);
|
||||||
if (sbindex) {
|
if (sbindex) {
|
||||||
const flt_t table2 = ctable[itable] +
|
const flt_t table2 = ctable[itable] +
|
||||||
fraction * dctable[itable];
|
fraction * dctable[itable];
|
||||||
const flt_t prefactor = qtmp * q[j] * table2;
|
const flt_t prefactor = qtmp * q[j] * table2;
|
||||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
|
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
|
||||||
prefactor;
|
prefactor;
|
||||||
forcecoul -= adjust;
|
forcecoul -= adjust;
|
||||||
if (EFLAG) ecoul -= adjust;
|
if (EFLAG) ecoul -= adjust;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
if (rsq < c_forcei[jtype].cut_ljsq) {
|
if (rsq < c_forcei[jtype].cut_ljsq) {
|
||||||
#endif
|
#endif
|
||||||
flt_t r6inv = r2inv * r2inv * r2inv;
|
flt_t r6inv = r2inv * r2inv * r2inv;
|
||||||
@ -389,7 +389,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
{ forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
{ forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const flt_t fpair = (forcecoul + forcebuck) * r2inv;
|
const flt_t fpair = (forcecoul + forcebuck) * r2inv;
|
||||||
const flt_t fpx = fpair * tdelx[jj];
|
const flt_t fpx = fpair * tdelx[jj];
|
||||||
fxtmp += fpx;
|
fxtmp += fpx;
|
||||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
@ -400,38 +400,38 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
fztmp += fpz;
|
fztmp += fpz;
|
||||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
sevdwl += evdwl;
|
sevdwl += evdwl;
|
||||||
secoul += ecoul;
|
secoul += ecoul;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
if (NEWTON_PAIR)
|
if (NEWTON_PAIR)
|
||||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
||||||
fpx, fpy, fpz);
|
fpx, fpy, fpz);
|
||||||
} // for jj
|
} // for jj
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
f[i].x += fxtmp;
|
f[i].x += fxtmp;
|
||||||
f[i].y += fytmp;
|
f[i].y += fytmp;
|
||||||
f[i].z += fztmp;
|
f[i].z += fztmp;
|
||||||
} else {
|
} else {
|
||||||
f[i].x = fxtmp;
|
f[i].x = fxtmp;
|
||||||
f[i].y = fytmp;
|
f[i].y = fytmp;
|
||||||
f[i].z = fztmp;
|
f[i].z = fztmp;
|
||||||
}
|
}
|
||||||
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
ov4, ov5);
|
ov4, ov5);
|
||||||
} // end of omp parallel region
|
} // end of omp parallel region
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||||
@ -440,12 +440,12 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
}
|
}
|
||||||
if (vflag) {
|
if (vflag) {
|
||||||
if (NEWTON_PAIR == 0) {
|
if (NEWTON_PAIR == 0) {
|
||||||
ov0 *= (acc_t)0.5;
|
ov0 *= (acc_t)0.5;
|
||||||
ov1 *= (acc_t)0.5;
|
ov1 *= (acc_t)0.5;
|
||||||
ov2 *= (acc_t)0.5;
|
ov2 *= (acc_t)0.5;
|
||||||
ov3 *= (acc_t)0.5;
|
ov3 *= (acc_t)0.5;
|
||||||
ov4 *= (acc_t)0.5;
|
ov4 *= (acc_t)0.5;
|
||||||
ov5 *= (acc_t)0.5;
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
ev_global[2] = ov0;
|
ev_global[2] = ov0;
|
||||||
ev_global[3] = ov1;
|
ev_global[3] = ov1;
|
||||||
@ -486,7 +486,7 @@ void PairBuckCoulLongIntel::init_style()
|
|||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"The 'package intel' command is required for /intel styles");
|
"The 'package intel' command is required for /intel styles");
|
||||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||||
|
|
||||||
fix->pair_init_check();
|
fix->pair_init_check();
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
_cop = fix->coprocessor_number();
|
_cop = fix->coprocessor_number();
|
||||||
@ -549,7 +549,7 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
for (int j = 0; j < tp1; j++) {
|
for (int j = 0; j < tp1; j++) {
|
||||||
if (cutsq[i][j] < cut_ljsq[i][j])
|
if (cutsq[i][j] < cut_ljsq[i][j])
|
||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic");
|
"Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic");
|
||||||
fc.c_force[i][j].cutsq = cutsq[i][j];
|
fc.c_force[i][j].cutsq = cutsq[i][j];
|
||||||
fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
|
fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
|
||||||
fc.c_force[i][j].buck1 = buck1[i][j];
|
fc.c_force[i][j].buck1 = buck1[i][j];
|
||||||
@ -603,9 +603,9 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void PairBuckCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
void PairBuckCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
||||||
const int ntable,
|
const int ntable,
|
||||||
Memory *memory,
|
Memory *memory,
|
||||||
const int cop) {
|
const int cop) {
|
||||||
if ( (ntypes != _ntypes || ntable != _ntable) ) {
|
if ( (ntypes != _ntypes || ntable != _ntable) ) {
|
||||||
if (_ntypes > 0) {
|
if (_ntypes > 0) {
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
@ -625,10 +625,10 @@ void PairBuckCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
ospecial_coul != NULL && _cop >= 0) {
|
ospecial_coul != NULL && _cop >= 0) {
|
||||||
#pragma offload_transfer target(mic:cop) \
|
#pragma offload_transfer target(mic:cop) \
|
||||||
nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
|
nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
|
||||||
nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
|
nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
|
||||||
nocopy(orho_inv: alloc_if(0) free_if(1)) \
|
nocopy(orho_inv: alloc_if(0) free_if(1)) \
|
||||||
nocopy(otable: alloc_if(0) free_if(1)) \
|
nocopy(otable: alloc_if(0) free_if(1)) \
|
||||||
nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
|
nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@ -50,8 +50,8 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong {
|
|||||||
|
|
||||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void eval(const int offload, const int vflag,
|
void eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> * buffers,
|
IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
@ -76,7 +76,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong {
|
|||||||
~ForceConst() { set_ntypes(0,0,NULL,_cop); }
|
~ForceConst() { set_ntypes(0,0,NULL,_cop); }
|
||||||
|
|
||||||
void set_ntypes(const int ntypes, const int ntable, Memory *memory,
|
void set_ntypes(const int ntypes, const int ntable, Memory *memory,
|
||||||
const int cop);
|
const int cop);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int _ntypes, _ntable, _cop;
|
int _ntypes, _ntable, _cop;
|
||||||
|
|||||||
@ -48,7 +48,7 @@ PairBuckIntel::~PairBuckIntel()
|
|||||||
void PairBuckIntel::compute(int eflag, int vflag)
|
void PairBuckIntel::compute(int eflag, int vflag)
|
||||||
{
|
{
|
||||||
if (fix->precision()==FixIntel::PREC_MODE_MIXED)
|
if (fix->precision()==FixIntel::PREC_MODE_MIXED)
|
||||||
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
||||||
force_const_single);
|
force_const_single);
|
||||||
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
|
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
|
||||||
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
||||||
@ -63,8 +63,8 @@ void PairBuckIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void PairBuckIntel::compute(int eflag, int vflag,
|
void PairBuckIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) {
|
if (eflag || vflag) {
|
||||||
ev_setup(eflag,vflag);
|
ev_setup(eflag,vflag);
|
||||||
@ -87,13 +87,13 @@ void PairBuckIntel::compute(int eflag, int vflag,
|
|||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||||
packthreads, sizeof(ATOM_T));
|
packthreads, sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom,ito,ago);
|
buffers->thr_pack(ifrom,ito,ago);
|
||||||
}
|
}
|
||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ovflag = 0;
|
int ovflag = 0;
|
||||||
if (vflag_fdotr) ovflag = 2;
|
if (vflag_fdotr) ovflag = 2;
|
||||||
else if (vflag) ovflag = 1;
|
else if (vflag) ovflag = 1;
|
||||||
@ -120,9 +120,9 @@ void PairBuckIntel::compute(int eflag, int vflag,
|
|||||||
|
|
||||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void PairBuckIntel::eval(const int offload, const int vflag,
|
void PairBuckIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
const int astart, const int aend)
|
const int astart, const int aend)
|
||||||
{
|
{
|
||||||
const int inum = aend - astart;
|
const int inum = aend - astart;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@ -147,8 +147,8 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
|||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
int tc;
|
int tc;
|
||||||
FORCE_T * _noalias f_start;
|
FORCE_T * _noalias f_start;
|
||||||
@ -160,7 +160,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
|||||||
int *overflow = fix->get_off_overflow_flag();
|
int *overflow = fix->get_off_overflow_flag();
|
||||||
double *timer_compute = fix->off_watch_pair();
|
double *timer_compute = fix->off_watch_pair();
|
||||||
// Redeclare as local variables for offload
|
// Redeclare as local variables for offload
|
||||||
|
|
||||||
if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
|
if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
|
||||||
#pragma offload target(mic:_cop) if(offload) \
|
#pragma offload target(mic:_cop) if(offload) \
|
||||||
in(special_lj:length(0) alloc_if(0) free_if(0)) \
|
in(special_lj:length(0) alloc_if(0) free_if(0)) \
|
||||||
@ -182,8 +182,8 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
|||||||
*timer_compute = MIC_Wtime();
|
*timer_compute = MIC_Wtime();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
||||||
f_stride, x, 0);
|
f_stride, x, 0);
|
||||||
|
|
||||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EFLAG) oevdwl = (acc_t)0;
|
if (EFLAG) oevdwl = (acc_t)0;
|
||||||
@ -215,23 +215,23 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
|||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
|
|
||||||
acc_t fxtmp,fytmp,fztmp,fwtmp;
|
acc_t fxtmp,fytmp,fztmp,fwtmp;
|
||||||
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
|
|
||||||
const flt_t xtmp = x[i].x;
|
const flt_t xtmp = x[i].x;
|
||||||
const flt_t ytmp = x[i].y;
|
const flt_t ytmp = x[i].y;
|
||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
|
|
||||||
flt_t forcebuck, evdwl;
|
flt_t forcebuck, evdwl;
|
||||||
forcebuck = evdwl = (flt_t)0.0;
|
forcebuck = evdwl = (flt_t)0.0;
|
||||||
|
|
||||||
@ -245,7 +245,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
const flt_t r = sqrt(rsq);
|
const flt_t r = sqrt(rsq);
|
||||||
const flt_t r2inv = (flt_t)1.0 / rsq;
|
const flt_t r2inv = (flt_t)1.0 / rsq;
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
if (rsq < c_forcei[jtype].cutsq) {
|
if (rsq < c_forcei[jtype].cutsq) {
|
||||||
#endif
|
#endif
|
||||||
@ -257,7 +257,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
|||||||
#ifndef INTEL_VMASK
|
#ifndef INTEL_VMASK
|
||||||
if (rsq > c_forcei[jtype].cutsq)
|
if (rsq > c_forcei[jtype].cutsq)
|
||||||
forcebuck =(flt_t)0.0;
|
forcebuck =(flt_t)0.0;
|
||||||
#endif
|
#endif
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
evdwl = rexp * c_energyi[jtype].a -
|
evdwl = rexp * c_energyi[jtype].a -
|
||||||
r6inv * c_energyi[jtype].c -
|
r6inv * c_energyi[jtype].c -
|
||||||
@ -272,67 +272,67 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
|||||||
if (sbindex) {
|
if (sbindex) {
|
||||||
const flt_t factor_lj = special_lj[sbindex];
|
const flt_t factor_lj = special_lj[sbindex];
|
||||||
forcebuck *= factor_lj;
|
forcebuck *= factor_lj;
|
||||||
if (EFLAG)
|
if (EFLAG)
|
||||||
evdwl *= factor_lj;
|
evdwl *= factor_lj;
|
||||||
}
|
}
|
||||||
const flt_t fpair = forcebuck * r2inv;
|
const flt_t fpair = forcebuck * r2inv;
|
||||||
const flt_t fpx = fpair * delx;
|
const flt_t fpx = fpair * delx;
|
||||||
fxtmp += fpx;
|
fxtmp += fpx;
|
||||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
const flt_t fpy = fpair * dely;
|
const flt_t fpy = fpair * dely;
|
||||||
fytmp += fpy;
|
fytmp += fpy;
|
||||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||||
const flt_t fpz = fpair * delz;
|
const flt_t fpz = fpair * delz;
|
||||||
fztmp += fpz;
|
fztmp += fpz;
|
||||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
sevdwl += evdwl;
|
sevdwl += evdwl;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
fwtmp += (flt_t)0.5 * evdwl;
|
fwtmp += (flt_t)0.5 * evdwl;
|
||||||
if (NEWTON_PAIR)
|
if (NEWTON_PAIR)
|
||||||
f[j].w += (flt_t)0.5 * evdwl;
|
f[j].w += (flt_t)0.5 * evdwl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // for jj
|
} // for jj
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
f[i].x += fxtmp;
|
f[i].x += fxtmp;
|
||||||
f[i].y += fytmp;
|
f[i].y += fytmp;
|
||||||
f[i].z += fztmp;
|
f[i].z += fztmp;
|
||||||
} else {
|
} else {
|
||||||
f[i].x = fxtmp;
|
f[i].x = fxtmp;
|
||||||
f[i].y = fytmp;
|
f[i].y = fytmp;
|
||||||
f[i].z = fztmp;
|
f[i].z = fztmp;
|
||||||
}
|
}
|
||||||
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
ov4, ov5);
|
ov4, ov5);
|
||||||
} // end of omp parallel region
|
} // end of omp parallel region
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||||
ev_global[0] = oevdwl;
|
ev_global[0] = oevdwl;
|
||||||
ev_global[1] = (acc_t)0;
|
ev_global[1] = (acc_t)0;
|
||||||
}
|
}
|
||||||
if (vflag) {
|
if (vflag) {
|
||||||
if (NEWTON_PAIR == 0) {
|
if (NEWTON_PAIR == 0) {
|
||||||
ov0 *= (acc_t)0.5;
|
ov0 *= (acc_t)0.5;
|
||||||
ov1 *= (acc_t)0.5;
|
ov1 *= (acc_t)0.5;
|
||||||
ov2 *= (acc_t)0.5;
|
ov2 *= (acc_t)0.5;
|
||||||
ov3 *= (acc_t)0.5;
|
ov3 *= (acc_t)0.5;
|
||||||
ov4 *= (acc_t)0.5;
|
ov4 *= (acc_t)0.5;
|
||||||
ov5 *= (acc_t)0.5;
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
ev_global[2] = ov0;
|
ev_global[2] = ov0;
|
||||||
ev_global[3] = ov1;
|
ev_global[3] = ov1;
|
||||||
@ -371,7 +371,7 @@ void PairBuckIntel::init_style()
|
|||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"The 'package intel' command is required for /intel styles");
|
"The 'package intel' command is required for /intel styles");
|
||||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||||
|
|
||||||
fix->pair_init_check();
|
fix->pair_init_check();
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
_cop = fix->coprocessor_number();
|
_cop = fix->coprocessor_number();
|
||||||
@ -442,7 +442,7 @@ void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
||||||
Memory *memory,
|
Memory *memory,
|
||||||
const int cop) {
|
const int cop) {
|
||||||
if ( (ntypes != _ntypes ) ) {
|
if ( (ntypes != _ntypes ) ) {
|
||||||
@ -452,8 +452,8 @@ void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
c_force_t * oc_force = c_force[0];
|
c_force_t * oc_force = c_force[0];
|
||||||
c_energy_t * oc_energy = c_energy[0];
|
c_energy_t * oc_energy = c_energy[0];
|
||||||
|
|
||||||
if (ospecial_lj != NULL && oc_force != NULL &&
|
if (ospecial_lj != NULL && oc_force != NULL &&
|
||||||
oc_energy != NULL &&
|
oc_energy != NULL &&
|
||||||
_cop >= 0) {
|
_cop >= 0) {
|
||||||
#pragma offload_transfer target(mic:cop) \
|
#pragma offload_transfer target(mic:cop) \
|
||||||
nocopy(ospecial_lj: alloc_if(0) free_if(1)) \
|
nocopy(ospecial_lj: alloc_if(0) free_if(1)) \
|
||||||
@ -476,8 +476,8 @@ void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
c_force_t * oc_force = c_force[0];
|
c_force_t * oc_force = c_force[0];
|
||||||
c_energy_t * oc_energy = c_energy[0];
|
c_energy_t * oc_energy = c_energy[0];
|
||||||
int tp1sq = ntypes*ntypes;
|
int tp1sq = ntypes*ntypes;
|
||||||
if (ospecial_lj != NULL && oc_force != NULL &&
|
if (ospecial_lj != NULL && oc_force != NULL &&
|
||||||
oc_energy != NULL &&
|
oc_energy != NULL &&
|
||||||
cop >= 0) {
|
cop >= 0) {
|
||||||
#pragma offload_transfer target(mic:cop) \
|
#pragma offload_transfer target(mic:cop) \
|
||||||
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
|
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
|
||||||
|
|||||||
@ -50,8 +50,8 @@ private:
|
|||||||
|
|
||||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void eval(const int offload, const int vflag,
|
void eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> * buffers,
|
IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
@ -59,7 +59,7 @@ private:
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
class ForceConst {
|
class ForceConst {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
typedef struct { flt_t buck1, buck2, rhoinv, cutsq; } c_force_t;
|
typedef struct { flt_t buck1, buck2, rhoinv, cutsq; } c_force_t;
|
||||||
typedef struct { flt_t a, c, offset, pad; } c_energy_t;
|
typedef struct { flt_t a, c, offset, pad; } c_energy_t;
|
||||||
@ -78,7 +78,7 @@ private:
|
|||||||
int _ntypes, _cop;
|
int _ntypes, _cop;
|
||||||
Memory *_memory;
|
Memory *_memory;
|
||||||
};
|
};
|
||||||
|
|
||||||
ForceConst<float> force_const_single;
|
ForceConst<float> force_const_single;
|
||||||
ForceConst<double> force_const_double;
|
ForceConst<double> force_const_double;
|
||||||
};
|
};
|
||||||
|
|||||||
@ -74,8 +74,8 @@ void PairEAMIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void PairEAMIntel::compute(int eflag, int vflag,
|
void PairEAMIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) {
|
if (eflag || vflag) {
|
||||||
ev_setup(eflag, vflag);
|
ev_setup(eflag, vflag);
|
||||||
@ -111,37 +111,37 @@ void PairEAMIntel::compute(int eflag, int vflag,
|
|||||||
if (_onetype) {
|
if (_onetype) {
|
||||||
if (eflag) {
|
if (eflag) {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (eflag) {
|
if (eflag) {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -151,8 +151,8 @@ void PairEAMIntel::compute(int eflag, int vflag,
|
|||||||
|
|
||||||
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void PairEAMIntel::eval(const int offload, const int vflag,
|
void PairEAMIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
const int astart, const int aend)
|
const int astart, const int aend)
|
||||||
{
|
{
|
||||||
const int inum = aend - astart;
|
const int inum = aend - astart;
|
||||||
@ -251,8 +251,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int iifrom, iito, tid;
|
int iifrom, iito, tid;
|
||||||
IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads,
|
IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads,
|
||||||
INTEL_VECTOR_WIDTH);
|
INTEL_VECTOR_WIDTH);
|
||||||
iifrom += astart;
|
iifrom += astart;
|
||||||
iito += astart;
|
iito += astart;
|
||||||
|
|
||||||
@ -264,8 +264,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
else foff = 0;
|
else foff = 0;
|
||||||
double * _noalias const trho = rho + foff;
|
double * _noalias const trho = rho + foff;
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||||
memset(trho, 0, nall * sizeof(double));
|
memset(trho, 0, nall * sizeof(double));
|
||||||
}
|
}
|
||||||
|
|
||||||
const int toffs = tid * ccache_stride;
|
const int toffs = tid * ccache_stride;
|
||||||
@ -280,108 +280,108 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
int rhor_joff, frho_ioff;
|
int rhor_joff, frho_ioff;
|
||||||
if (ONETYPE) {
|
if (ONETYPE) {
|
||||||
const int ptr_off=_onetype * ntypes + _onetype;
|
const int ptr_off=_onetype * ntypes + _onetype;
|
||||||
oscale = scale_f[ptr_off];
|
oscale = scale_f[ptr_off];
|
||||||
int rhor_ioff = istride * _onetype;
|
int rhor_ioff = istride * _onetype;
|
||||||
rhor_joff = rhor_ioff + _onetype * jstride;
|
rhor_joff = rhor_ioff + _onetype * jstride;
|
||||||
frho_ioff = fstride * _onetype;
|
frho_ioff = fstride * _onetype;
|
||||||
}
|
}
|
||||||
for (int i = iifrom; i < iito; ++i) {
|
for (int i = iifrom; i < iito; ++i) {
|
||||||
int itype, rhor_ioff;
|
int itype, rhor_ioff;
|
||||||
if (!ONETYPE) {
|
if (!ONETYPE) {
|
||||||
itype = x[i].w;
|
itype = x[i].w;
|
||||||
rhor_ioff = istride * itype;
|
rhor_ioff = istride * itype;
|
||||||
}
|
}
|
||||||
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
|
|
||||||
const flt_t xtmp = x[i].x;
|
const flt_t xtmp = x[i].x;
|
||||||
const flt_t ytmp = x[i].y;
|
const flt_t ytmp = x[i].y;
|
||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
|
|
||||||
acc_t rhoi = (acc_t)0.0;
|
acc_t rhoi = (acc_t)0.0;
|
||||||
int ej = 0;
|
int ej = 0;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
const int j = jlist[jj] & NEIGHMASK;
|
const int j = jlist[jj] & NEIGHMASK;
|
||||||
const flt_t delx = xtmp - x[j].x;
|
const flt_t delx = xtmp - x[j].x;
|
||||||
const flt_t dely = ytmp - x[j].y;
|
const flt_t dely = ytmp - x[j].y;
|
||||||
const flt_t delz = ztmp - x[j].z;
|
const flt_t delz = ztmp - x[j].z;
|
||||||
const flt_t rsq = delx*delx + dely*dely + delz*delz;
|
const flt_t rsq = delx*delx + dely*dely + delz*delz;
|
||||||
|
|
||||||
if (rsq < fcutforcesq) {
|
if (rsq < fcutforcesq) {
|
||||||
trsq[ej]=rsq;
|
trsq[ej]=rsq;
|
||||||
if (!ONETYPE) tjtype[ej]=x[j].w;
|
if (!ONETYPE) tjtype[ej]=x[j].w;
|
||||||
tj[ej]=jlist[jj];
|
tj[ej]=jlist[jj];
|
||||||
ej++;
|
ej++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:rhoi)
|
#pragma simd reduction(+:rhoi)
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < ej; jj++) {
|
for (int jj = 0; jj < ej; jj++) {
|
||||||
int jtype;
|
int jtype;
|
||||||
const int j = tj[jj] & NEIGHMASK;
|
const int j = tj[jj] & NEIGHMASK;
|
||||||
if (!ONETYPE) jtype = tjtype[jj];
|
if (!ONETYPE) jtype = tjtype[jj];
|
||||||
const flt_t rsq = trsq[jj];
|
const flt_t rsq = trsq[jj];
|
||||||
flt_t p = sqrt(rsq)*frdr + (flt_t)1.0;
|
flt_t p = sqrt(rsq)*frdr + (flt_t)1.0;
|
||||||
int m = static_cast<int> (p);
|
int m = static_cast<int> (p);
|
||||||
m = MIN(m,nr-1);
|
m = MIN(m,nr-1);
|
||||||
p -= m;
|
p -= m;
|
||||||
p = MIN(p,(flt_t)1.0);
|
p = MIN(p,(flt_t)1.0);
|
||||||
if (!ONETYPE)
|
if (!ONETYPE)
|
||||||
rhor_joff = rhor_ioff + jtype * jstride;
|
rhor_joff = rhor_ioff + jtype * jstride;
|
||||||
const int joff = rhor_joff + m;
|
const int joff = rhor_joff + m;
|
||||||
flt_t ra;
|
flt_t ra;
|
||||||
ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
|
ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
|
||||||
rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
|
rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
|
||||||
rhoi += ra;
|
rhoi += ra;
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
if (!ONETYPE) {
|
if (!ONETYPE) {
|
||||||
const int ioff = jtype * istride + itype * jstride + m;
|
const int ioff = jtype * istride + itype * jstride + m;
|
||||||
ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
|
ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
|
||||||
rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
|
rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
|
||||||
}
|
}
|
||||||
trho[j] += ra;
|
trho[j] += ra;
|
||||||
}
|
}
|
||||||
} // for jj
|
} // for jj
|
||||||
if (NEWTON_PAIR)
|
if (NEWTON_PAIR)
|
||||||
trho[i] += rhoi;
|
trho[i] += rhoi;
|
||||||
else
|
else
|
||||||
trho[i] = rhoi;
|
trho[i] = rhoi;
|
||||||
} // for i
|
} // for i
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
if (NEWTON_PAIR && nthreads > 1) {
|
if (NEWTON_PAIR && nthreads > 1) {
|
||||||
#pragma omp barrier
|
#pragma omp barrier
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
const int rcount = nall;
|
const int rcount = nall;
|
||||||
if (nthreads == 2) {
|
if (nthreads == 2) {
|
||||||
double *trho2 = rho + nmax;
|
double *trho2 = rho + nmax;
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd
|
#pragma simd
|
||||||
for (int n = 0; n < rcount; n++)
|
for (int n = 0; n < rcount; n++)
|
||||||
rho[n] += trho2[n];
|
rho[n] += trho2[n];
|
||||||
} else if (nthreads == 4) {
|
} else if (nthreads == 4) {
|
||||||
double *trho2 = rho + nmax;
|
double *trho2 = rho + nmax;
|
||||||
double *trho3 = trho2 + nmax;
|
double *trho3 = trho2 + nmax;
|
||||||
double *trho4 = trho3 + nmax;
|
double *trho4 = trho3 + nmax;
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd
|
#pragma simd
|
||||||
for (int n = 0; n < rcount; n++)
|
for (int n = 0; n < rcount; n++)
|
||||||
rho[n] += trho2[n] + trho3[n] + trho4[n];
|
rho[n] += trho2[n] + trho3[n] + trho4[n];
|
||||||
} else {
|
} else {
|
||||||
double *trhon = rho + nmax;
|
double *trhon = rho + nmax;
|
||||||
for (int t = 1; t < nthreads; t++) {
|
for (int t = 1; t < nthreads; t++) {
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd
|
#pragma simd
|
||||||
for (int n = 0; n < rcount; n++)
|
for (int n = 0; n < rcount; n++)
|
||||||
rho[n] += trhon[n];
|
rho[n] += trhon[n];
|
||||||
trhon += nmax;
|
trhon += nmax;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -411,32 +411,32 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
#pragma simd reduction(+:tevdwl)
|
#pragma simd reduction(+:tevdwl)
|
||||||
#endif
|
#endif
|
||||||
for (int i = iifrom; i < iito; ++i) {
|
for (int i = iifrom; i < iito; ++i) {
|
||||||
int itype;
|
int itype;
|
||||||
if (!ONETYPE) itype = x[i].w;
|
if (!ONETYPE) itype = x[i].w;
|
||||||
flt_t p = rho[i]*frdrho + (flt_t)1.0;
|
flt_t p = rho[i]*frdrho + (flt_t)1.0;
|
||||||
int m = static_cast<int> (p);
|
int m = static_cast<int> (p);
|
||||||
m = MAX(1,MIN(m,nrho-1));
|
m = MAX(1,MIN(m,nrho-1));
|
||||||
p -= m;
|
p -= m;
|
||||||
p = MIN(p,(flt_t)1.0);
|
p = MIN(p,(flt_t)1.0);
|
||||||
if (!ONETYPE) frho_ioff = itype * fstride;
|
if (!ONETYPE) frho_ioff = itype * fstride;
|
||||||
const int ioff = frho_ioff + m;
|
const int ioff = frho_ioff + m;
|
||||||
fp_f[i] = (frho_spline_f[ioff].a*p + frho_spline_f[ioff].b)*p +
|
fp_f[i] = (frho_spline_f[ioff].a*p + frho_spline_f[ioff].b)*p +
|
||||||
frho_spline_f[ioff].c;
|
frho_spline_f[ioff].c;
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
flt_t phi = ((frho_spline_e[ioff].a*p + frho_spline_e[ioff].b)*p +
|
flt_t phi = ((frho_spline_e[ioff].a*p + frho_spline_e[ioff].b)*p +
|
||||||
frho_spline_e[ioff].c)*p + frho_spline_e[ioff].d;
|
frho_spline_e[ioff].c)*p + frho_spline_e[ioff].d;
|
||||||
if (rho[i] > frhomax) phi += fp_f[i] * (rho[i]-frhomax);
|
if (rho[i] > frhomax) phi += fp_f[i] * (rho[i]-frhomax);
|
||||||
if (!ONETYPE) {
|
if (!ONETYPE) {
|
||||||
const int ptr_off=itype*ntypes + itype;
|
const int ptr_off=itype*ntypes + itype;
|
||||||
oscale = scale_f[ptr_off];
|
oscale = scale_f[ptr_off];
|
||||||
}
|
}
|
||||||
phi *= oscale;
|
phi *= oscale;
|
||||||
tevdwl += phi;
|
tevdwl += phi;
|
||||||
if (eatom) f[i].w += phi;
|
if (eatom) f[i].w += phi;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (EFLAG) oevdwl += tevdwl;
|
if (EFLAG) oevdwl += tevdwl;
|
||||||
|
|
||||||
|
|
||||||
// communicate derivative of embedding function
|
// communicate derivative of embedding function
|
||||||
|
|
||||||
@ -447,7 +447,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
if (tid == 0)
|
if (tid == 0)
|
||||||
comm->forward_comm_pair(this);
|
comm->forward_comm_pair(this);
|
||||||
if (NEWTON_PAIR)
|
if (NEWTON_PAIR)
|
||||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp barrier
|
#pragma omp barrier
|
||||||
@ -458,94 +458,94 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
for (int i = iifrom; i < iito; ++i) {
|
for (int i = iifrom; i < iito; ++i) {
|
||||||
int itype, rhor_ioff;
|
int itype, rhor_ioff;
|
||||||
const flt_t * _noalias scale_fi;
|
const flt_t * _noalias scale_fi;
|
||||||
if (!ONETYPE) {
|
if (!ONETYPE) {
|
||||||
itype = x[i].w;
|
itype = x[i].w;
|
||||||
rhor_ioff = istride * itype;
|
rhor_ioff = istride * itype;
|
||||||
scale_fi = scale_f + itype*ntypes;
|
scale_fi = scale_f + itype*ntypes;
|
||||||
}
|
}
|
||||||
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
|
|
||||||
acc_t fxtmp, fytmp, fztmp, fwtmp;
|
acc_t fxtmp, fytmp, fztmp, fwtmp;
|
||||||
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
|
|
||||||
const flt_t xtmp = x[i].x;
|
const flt_t xtmp = x[i].x;
|
||||||
const flt_t ytmp = x[i].y;
|
const flt_t ytmp = x[i].y;
|
||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
|
|
||||||
int ej = 0;
|
int ej = 0;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
const int j = jlist[jj] & NEIGHMASK;
|
const int j = jlist[jj] & NEIGHMASK;
|
||||||
const flt_t delx = xtmp - x[j].x;
|
const flt_t delx = xtmp - x[j].x;
|
||||||
const flt_t dely = ytmp - x[j].y;
|
const flt_t dely = ytmp - x[j].y;
|
||||||
const flt_t delz = ztmp - x[j].z;
|
const flt_t delz = ztmp - x[j].z;
|
||||||
const flt_t rsq = delx*delx + dely*dely + delz*delz;
|
const flt_t rsq = delx*delx + dely*dely + delz*delz;
|
||||||
|
|
||||||
if (rsq < fcutforcesq) {
|
if (rsq < fcutforcesq) {
|
||||||
trsq[ej]=rsq;
|
trsq[ej]=rsq;
|
||||||
tdelx[ej]=delx;
|
tdelx[ej]=delx;
|
||||||
tdely[ej]=dely;
|
tdely[ej]=dely;
|
||||||
tdelz[ej]=delz;
|
tdelz[ej]=delz;
|
||||||
if (!ONETYPE) tjtype[ej]=x[j].w;
|
if (!ONETYPE) tjtype[ej]=x[j].w;
|
||||||
tj[ej]=jlist[jj];
|
tj[ej]=jlist[jj];
|
||||||
ej++;
|
ej++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < ej; jj++) {
|
for (int jj = 0; jj < ej; jj++) {
|
||||||
int jtype;
|
int jtype;
|
||||||
const int j = tj[jj] & NEIGHMASK;
|
const int j = tj[jj] & NEIGHMASK;
|
||||||
if (!ONETYPE) jtype = tjtype[jj];
|
if (!ONETYPE) jtype = tjtype[jj];
|
||||||
const flt_t rsq = trsq[jj];
|
const flt_t rsq = trsq[jj];
|
||||||
const flt_t r = sqrt(rsq);
|
const flt_t r = sqrt(rsq);
|
||||||
flt_t p = r*frdr + (flt_t)1.0;
|
flt_t p = r*frdr + (flt_t)1.0;
|
||||||
int m = static_cast<int> (p);
|
int m = static_cast<int> (p);
|
||||||
m = MIN(m,nr-1);
|
m = MIN(m,nr-1);
|
||||||
p -= m;
|
p -= m;
|
||||||
p = MIN(p,(flt_t)1.0);
|
p = MIN(p,(flt_t)1.0);
|
||||||
if (!ONETYPE)
|
if (!ONETYPE)
|
||||||
rhor_joff = rhor_ioff + jtype * jstride;
|
rhor_joff = rhor_ioff + jtype * jstride;
|
||||||
const int joff = rhor_joff + m;
|
const int joff = rhor_joff + m;
|
||||||
const flt_t rhojp = (rhor_spline_f[joff].a*p +
|
const flt_t rhojp = (rhor_spline_f[joff].a*p +
|
||||||
rhor_spline_f[joff].b)*p +
|
rhor_spline_f[joff].b)*p +
|
||||||
rhor_spline_f[joff].c;
|
rhor_spline_f[joff].c;
|
||||||
flt_t rhoip;
|
flt_t rhoip;
|
||||||
if (!ONETYPE) {
|
if (!ONETYPE) {
|
||||||
const int ioff = jtype * istride + itype * jstride + m;
|
const int ioff = jtype * istride + itype * jstride + m;
|
||||||
rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p +
|
rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p +
|
||||||
rhor_spline_f[ioff].c;
|
rhor_spline_f[ioff].c;
|
||||||
} else
|
} else
|
||||||
rhoip = rhojp;
|
rhoip = rhojp;
|
||||||
const flt_t z2p = (z2r_spline_t[joff].a*p +
|
const flt_t z2p = (z2r_spline_t[joff].a*p +
|
||||||
z2r_spline_t[joff].b)*p +
|
z2r_spline_t[joff].b)*p +
|
||||||
z2r_spline_t[joff].c;
|
z2r_spline_t[joff].c;
|
||||||
const flt_t z2 = ((z2r_spline_t[joff].d*p +
|
const flt_t z2 = ((z2r_spline_t[joff].d*p +
|
||||||
z2r_spline_t[joff].e)*p +
|
z2r_spline_t[joff].e)*p +
|
||||||
z2r_spline_t[joff].f)*p +
|
z2r_spline_t[joff].f)*p +
|
||||||
z2r_spline_t[joff].g;
|
z2r_spline_t[joff].g;
|
||||||
|
|
||||||
const flt_t recip = (flt_t)1.0/r;
|
const flt_t recip = (flt_t)1.0/r;
|
||||||
const flt_t phi = z2*recip;
|
const flt_t phi = z2*recip;
|
||||||
const flt_t phip = z2p*recip - phi*recip;
|
const flt_t phip = z2p*recip - phi*recip;
|
||||||
const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip;
|
const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip;
|
||||||
if (!ONETYPE)
|
if (!ONETYPE)
|
||||||
oscale = scale_fi[jtype];
|
oscale = scale_fi[jtype];
|
||||||
const flt_t fpair = -oscale*psip*recip;
|
const flt_t fpair = -oscale*psip*recip;
|
||||||
|
|
||||||
const flt_t fpx = fpair * tdelx[jj];
|
const flt_t fpx = fpair * tdelx[jj];
|
||||||
fxtmp += fpx;
|
fxtmp += fpx;
|
||||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
@ -556,20 +556,20 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
fztmp += fpz;
|
fztmp += fpz;
|
||||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
const flt_t evdwl = oscale*phi;
|
const flt_t evdwl = oscale*phi;
|
||||||
sevdwl += evdwl;
|
sevdwl += evdwl;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
fwtmp += (flt_t)0.5 * evdwl;
|
fwtmp += (flt_t)0.5 * evdwl;
|
||||||
if (NEWTON_PAIR)
|
if (NEWTON_PAIR)
|
||||||
f[j].w += (flt_t)0.5 * evdwl;
|
f[j].w += (flt_t)0.5 * evdwl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
||||||
fpx, fpy, fpz);
|
fpx, fpy, fpz);
|
||||||
} // for jj
|
} // for jj
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
f[i].x += fxtmp;
|
f[i].x += fxtmp;
|
||||||
f[i].y += fytmp;
|
f[i].y += fytmp;
|
||||||
f[i].z += fztmp;
|
f[i].z += fztmp;
|
||||||
@ -577,19 +577,19 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
f[i].x = fxtmp;
|
f[i].x = fxtmp;
|
||||||
f[i].y = fytmp;
|
f[i].y = fytmp;
|
||||||
f[i].z = fztmp;
|
f[i].z = fztmp;
|
||||||
sevdwl *= (acc_t)0.5;
|
sevdwl *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
|
|
||||||
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for i
|
} // for i
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
ov4, ov5);
|
ov4, ov5);
|
||||||
} /// omp
|
} /// omp
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
ev_global[0] = oevdwl;
|
ev_global[0] = oevdwl;
|
||||||
@ -597,13 +597,13 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
}
|
}
|
||||||
if (vflag) {
|
if (vflag) {
|
||||||
if (NEWTON_PAIR == 0) {
|
if (NEWTON_PAIR == 0) {
|
||||||
ov0 *= (acc_t)0.5;
|
ov0 *= (acc_t)0.5;
|
||||||
ov1 *= (acc_t)0.5;
|
ov1 *= (acc_t)0.5;
|
||||||
ov2 *= (acc_t)0.5;
|
ov2 *= (acc_t)0.5;
|
||||||
ov3 *= (acc_t)0.5;
|
ov3 *= (acc_t)0.5;
|
||||||
ov4 *= (acc_t)0.5;
|
ov4 *= (acc_t)0.5;
|
||||||
ov5 *= (acc_t)0.5;
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
ev_global[2] = ov0;
|
ev_global[2] = ov0;
|
||||||
ev_global[3] = ov1;
|
ev_global[3] = ov1;
|
||||||
ev_global[4] = ov2;
|
ev_global[4] = ov2;
|
||||||
@ -665,7 +665,7 @@ void PairEAMIntel::init_style()
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
|
void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers)
|
IntelBuffers<flt_t,acc_t> *buffers)
|
||||||
{
|
{
|
||||||
int off_ccache = 0;
|
int off_ccache = 0;
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
@ -684,14 +684,14 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
for (int i = 1; i <= atom->ntypes; i++) {
|
for (int i = 1; i <= atom->ntypes; i++) {
|
||||||
for (int j = i; j <= atom->ntypes; j++) {
|
for (int j = i; j <= atom->ntypes; j++) {
|
||||||
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
|
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
|
||||||
cut = init_one(i,j);
|
cut = init_one(i,j);
|
||||||
cutneigh = cut + neighbor->skin;
|
cutneigh = cut + neighbor->skin;
|
||||||
cutsq[i][j] = cutsq[j][i] = cut*cut;
|
cutsq[i][j] = cutsq[j][i] = cut*cut;
|
||||||
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
|
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_onetype=-1;
|
_onetype=-1;
|
||||||
double oldscale=-1;
|
double oldscale=-1;
|
||||||
for (int i = 1; i < tp1; i++) {
|
for (int i = 1; i < tp1; i++) {
|
||||||
@ -709,32 +709,32 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
for (int j = 1; j < tp1; j++) {
|
for (int j = 1; j < tp1; j++) {
|
||||||
fc.scale_f[i][j] = scale[i][j];
|
fc.scale_f[i][j] = scale[i][j];
|
||||||
if (type2rhor[i][j] >= 0) {
|
if (type2rhor[i][j] >= 0) {
|
||||||
const int joff = ioff + j * fc.rhor_jstride();
|
const int joff = ioff + j * fc.rhor_jstride();
|
||||||
for (int k = 0; k < nr + 1; k++) {
|
for (int k = 0; k < nr + 1; k++) {
|
||||||
if (type2rhor[j][i] != type2rhor[i][j])
|
if (type2rhor[j][i] != type2rhor[i][j])
|
||||||
_onetype = 0;
|
_onetype = 0;
|
||||||
else if (_onetype < 0)
|
else if (_onetype < 0)
|
||||||
_onetype = i;
|
_onetype = i;
|
||||||
if (oldscale < 0)
|
if (oldscale < 0)
|
||||||
oldscale = scale[i][j];
|
oldscale = scale[i][j];
|
||||||
else
|
else
|
||||||
if (oldscale != scale[i][j])
|
if (oldscale != scale[i][j])
|
||||||
_onetype = 0;
|
_onetype = 0;
|
||||||
fc.rhor_spline_f[joff + k].a=rhor_spline[type2rhor[j][i]][k][0];
|
fc.rhor_spline_f[joff + k].a=rhor_spline[type2rhor[j][i]][k][0];
|
||||||
fc.rhor_spline_f[joff + k].b=rhor_spline[type2rhor[j][i]][k][1];
|
fc.rhor_spline_f[joff + k].b=rhor_spline[type2rhor[j][i]][k][1];
|
||||||
fc.rhor_spline_f[joff + k].c=rhor_spline[type2rhor[j][i]][k][2];
|
fc.rhor_spline_f[joff + k].c=rhor_spline[type2rhor[j][i]][k][2];
|
||||||
fc.rhor_spline_e[joff + k].a=rhor_spline[type2rhor[j][i]][k][3];
|
fc.rhor_spline_e[joff + k].a=rhor_spline[type2rhor[j][i]][k][3];
|
||||||
fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4];
|
fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4];
|
||||||
fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5];
|
fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5];
|
||||||
fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6];
|
fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6];
|
||||||
fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0];
|
fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0];
|
||||||
fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1];
|
fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1];
|
||||||
fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2];
|
fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2];
|
||||||
fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3];
|
fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3];
|
||||||
fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4];
|
fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4];
|
||||||
fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5];
|
fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5];
|
||||||
fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6];
|
fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -745,9 +745,9 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
||||||
const int nr, const int nrho,
|
const int nr, const int nrho,
|
||||||
Memory *memory,
|
Memory *memory,
|
||||||
const int cop) {
|
const int cop) {
|
||||||
if (ntypes != _ntypes || nr > _nr || nrho > _nrho) {
|
if (ntypes != _ntypes || nr > _nr || nrho > _nrho) {
|
||||||
if (_ntypes > 0) {
|
if (_ntypes > 0) {
|
||||||
_memory->destroy(rhor_spline_f);
|
_memory->destroy(rhor_spline_f);
|
||||||
@ -780,7 +780,7 @@ void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf,
|
int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf,
|
||||||
int pbc_flag, int *pbc)
|
int pbc_flag, int *pbc)
|
||||||
{
|
{
|
||||||
if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||||
return pack_forward_comm(n, list, buf, fp);
|
return pack_forward_comm(n, list, buf, fp);
|
||||||
@ -802,7 +802,7 @@ void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf)
|
|||||||
|
|
||||||
template<class flt_t>
|
template<class flt_t>
|
||||||
int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf,
|
int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf,
|
||||||
flt_t *fp_f)
|
flt_t *fp_f)
|
||||||
{
|
{
|
||||||
int i,j,m;
|
int i,j,m;
|
||||||
|
|
||||||
@ -817,8 +817,8 @@ int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf,
|
|||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
template<class flt_t>
|
template<class flt_t>
|
||||||
void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf,
|
void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf,
|
||||||
flt_t *fp_f)
|
flt_t *fp_f)
|
||||||
{
|
{
|
||||||
int i,m,last;
|
int i,m,last;
|
||||||
|
|
||||||
|
|||||||
@ -53,8 +53,8 @@ class PairEAMIntel : public PairEAM {
|
|||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t,
|
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t,
|
||||||
class acc_t>
|
class acc_t>
|
||||||
void eval(const int offload, const int vflag,
|
void eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> * buffers,
|
IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||||
@ -79,8 +79,8 @@ class PairEAMIntel : public PairEAM {
|
|||||||
ForceConst() : _ntypes(0), _nr(0) {}
|
ForceConst() : _ntypes(0), _nr(0) {}
|
||||||
~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
|
~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
|
||||||
|
|
||||||
void set_ntypes(const int ntypes, const int nr, const int nrho,
|
void set_ntypes(const int ntypes, const int nr, const int nrho,
|
||||||
Memory *memory, const int cop);
|
Memory *memory, const int cop);
|
||||||
inline int rhor_jstride() const { return _nr; }
|
inline int rhor_jstride() const { return _nr; }
|
||||||
inline int rhor_istride() const { return _nr * _ntypes; }
|
inline int rhor_istride() const { return _nr * _ntypes; }
|
||||||
inline int frho_stride() const { return _nrho; }
|
inline int frho_stride() const { return _nrho; }
|
||||||
|
|||||||
@ -98,17 +98,17 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
|
|||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads,
|
||||||
sizeof(ATOM_T));
|
sizeof(ATOM_T));
|
||||||
if (ago != 0) buffers->thr_pack(ifrom,ito,ago);
|
if (ago != 0) buffers->thr_pack(ifrom,ito,ago);
|
||||||
|
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
int qi = ellipsoid[i];
|
int qi = ellipsoid[i];
|
||||||
if (qi > -1) {
|
if (qi > -1) {
|
||||||
quat[i].w = bonus[qi].quat[0];
|
quat[i].w = bonus[qi].quat[0];
|
||||||
quat[i].i = bonus[qi].quat[1];
|
quat[i].i = bonus[qi].quat[1];
|
||||||
quat[i].j = bonus[qi].quat[2];
|
quat[i].j = bonus[qi].quat[2];
|
||||||
quat[i].k = bonus[qi].quat[3];
|
quat[i].k = bonus[qi].quat[3];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
quat[nall].w = (flt_t)1.0;
|
quat[nall].w = (flt_t)1.0;
|
||||||
@ -161,65 +161,65 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
if (fix->separate_buffers()) {
|
if (fix->separate_buffers()) {
|
||||||
fix->start_watch(TIME_PACK);
|
fix->start_watch(TIME_PACK);
|
||||||
if (offload) {
|
if (offload) {
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
int nthreads = comm->nthreads;
|
int nthreads = comm->nthreads;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,
|
||||||
nthreads, sizeof(ATOM_T));
|
nthreads, sizeof(ATOM_T));
|
||||||
if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
|
if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
int qi = ellipsoid[i];
|
int qi = ellipsoid[i];
|
||||||
if (qi > -1) {
|
if (qi > -1) {
|
||||||
quat[i].w = bonus[qi].quat[0];
|
quat[i].w = bonus[qi].quat[0];
|
||||||
quat[i].i = bonus[qi].quat[1];
|
quat[i].i = bonus[qi].quat[1];
|
||||||
quat[i].j = bonus[qi].quat[2];
|
quat[i].j = bonus[qi].quat[2];
|
||||||
quat[i].k = bonus[qi].quat[3];
|
quat[i].k = bonus[qi].quat[3];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int nghost = nall - nlocal;
|
int nghost = nall - nlocal;
|
||||||
if (nghost) {
|
if (nghost) {
|
||||||
IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,
|
IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,
|
||||||
nthreads, sizeof(ATOM_T));
|
nthreads, sizeof(ATOM_T));
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
ifrom += nlocal;
|
ifrom += nlocal;
|
||||||
ito += nlocal;
|
ito += nlocal;
|
||||||
if (ago != 0) {
|
if (ago != 0) {
|
||||||
offset = fix->offload_min_ghost() - nlocal;
|
offset = fix->offload_min_ghost() - nlocal;
|
||||||
buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
|
buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
|
||||||
}
|
}
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
int qi = ellipsoid[i + offset];
|
int qi = ellipsoid[i + offset];
|
||||||
if (qi > -1) {
|
if (qi > -1) {
|
||||||
quat[i].w = bonus[qi].quat[0];
|
quat[i].w = bonus[qi].quat[0];
|
||||||
quat[i].i = bonus[qi].quat[1];
|
quat[i].i = bonus[qi].quat[1];
|
||||||
quat[i].j = bonus[qi].quat[2];
|
quat[i].j = bonus[qi].quat[2];
|
||||||
quat[i].k = bonus[qi].quat[3];
|
quat[i].k = bonus[qi].quat[3];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);
|
if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);
|
||||||
for (int i = fix->host_min_local(); i < nlocal; i++) {
|
for (int i = fix->host_min_local(); i < nlocal; i++) {
|
||||||
int qi = ellipsoid[i];
|
int qi = ellipsoid[i];
|
||||||
if (qi > -1) {
|
if (qi > -1) {
|
||||||
quat[i].w = bonus[qi].quat[0];
|
quat[i].w = bonus[qi].quat[0];
|
||||||
quat[i].i = bonus[qi].quat[1];
|
quat[i].i = bonus[qi].quat[1];
|
||||||
quat[i].j = bonus[qi].quat[2];
|
quat[i].j = bonus[qi].quat[2];
|
||||||
quat[i].k = bonus[qi].quat[3];
|
quat[i].k = bonus[qi].quat[3];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int offset = fix->host_min_ghost() - nlocal;
|
int offset = fix->host_min_ghost() - nlocal;
|
||||||
if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset);
|
if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset);
|
||||||
for (int i = nlocal; i < nall; i++) {
|
for (int i = nlocal; i < nall; i++) {
|
||||||
int qi = ellipsoid[i + offset];
|
int qi = ellipsoid[i + offset];
|
||||||
if (qi > -1) {
|
if (qi > -1) {
|
||||||
quat[i].w = bonus[qi].quat[0];
|
quat[i].w = bonus[qi].quat[0];
|
||||||
quat[i].i = bonus[qi].quat[1];
|
quat[i].i = bonus[qi].quat[1];
|
||||||
quat[i].j = bonus[qi].quat[2];
|
quat[i].j = bonus[qi].quat[2];
|
||||||
quat[i].k = bonus[qi].quat[3];
|
quat[i].k = bonus[qi].quat[3];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
@ -252,8 +252,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
int tc;
|
int tc;
|
||||||
FORCE_T * _noalias f_start;
|
FORCE_T * _noalias f_start;
|
||||||
@ -303,26 +303,26 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (separate_flag) {
|
if (separate_flag) {
|
||||||
if (separate_flag < 3) {
|
if (separate_flag < 3) {
|
||||||
int all_local = nlocal;
|
int all_local = nlocal;
|
||||||
int ghost_min = overflow[LMP_GHOST_MIN];
|
int ghost_min = overflow[LMP_GHOST_MIN];
|
||||||
nlocal = overflow[LMP_LOCAL_MAX] + 1;
|
nlocal = overflow[LMP_LOCAL_MAX] + 1;
|
||||||
int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
|
int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
|
||||||
if (nghost < 0) nghost = 0;
|
if (nghost < 0) nghost = 0;
|
||||||
nall = nlocal + nghost;
|
nall = nlocal + nghost;
|
||||||
separate_flag--;
|
separate_flag--;
|
||||||
int flength;
|
int flength;
|
||||||
if (NEWTON_PAIR) flength = nall;
|
if (NEWTON_PAIR) flength = nall;
|
||||||
else flength = nlocal;
|
else flength = nlocal;
|
||||||
IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),
|
IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),
|
||||||
separate_flag);
|
separate_flag);
|
||||||
if (nghost) {
|
if (nghost) {
|
||||||
if (nlocal < all_local || ghost_min > all_local) {
|
if (nlocal < all_local || ghost_min > all_local) {
|
||||||
memmove(x + nlocal, x + ghost_min,
|
memmove(x + nlocal, x + ghost_min,
|
||||||
(nall - nlocal) * sizeof(ATOM_T));
|
(nall - nlocal) * sizeof(ATOM_T));
|
||||||
memmove(quat + nlocal, quat + ghost_min,
|
memmove(quat + nlocal, quat + ghost_min,
|
||||||
(nall - nlocal) * sizeof(QUAT_T));
|
(nall - nlocal) * sizeof(QUAT_T));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
x[nall].x = (flt_t)INTEL_BIGP;
|
x[nall].x = (flt_t)INTEL_BIGP;
|
||||||
x[nall].y = (flt_t)INTEL_BIGP;
|
x[nall].y = (flt_t)INTEL_BIGP;
|
||||||
@ -395,17 +395,17 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
|
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
|
||||||
|
|
||||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
|
if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||||
|
|
||||||
bool multiple_forms = false;
|
bool multiple_forms = false;
|
||||||
int packed_j = 0;
|
int packed_j = 0;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
int jm = jlist[jj];
|
int jm = jlist[jj];
|
||||||
int j = jm & NEIGHMASK;
|
int j = jm & NEIGHMASK;
|
||||||
const int jtype = x[j].w;
|
const int jtype = x[j].w;
|
||||||
@ -428,27 +428,27 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
} else
|
} else
|
||||||
multiple_forms = true;
|
multiple_forms = true;
|
||||||
}
|
}
|
||||||
const int edge = (packed_j % pad_width);
|
const int edge = (packed_j % pad_width);
|
||||||
if (edge) {
|
if (edge) {
|
||||||
const int packed_end = packed_j + (pad_width - edge);
|
const int packed_end = packed_j + (pad_width - edge);
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count min=1, max=15, avg=8
|
#pragma loop_count min=1, max=15, avg=8
|
||||||
#endif
|
#endif
|
||||||
for ( ; packed_j < packed_end; packed_j++)
|
for ( ; packed_j < packed_end; packed_j++)
|
||||||
jlist_form[packed_j] = nall;
|
jlist_form[packed_j] = nall;
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------------------
|
// -------------------------------------------------------------
|
||||||
|
|
||||||
#ifdef INTEL_V512
|
#ifdef INTEL_V512
|
||||||
__assume(packed_j % INTEL_VECTOR_WIDTH == 0);
|
__assume(packed_j % INTEL_VECTOR_WIDTH == 0);
|
||||||
__assume(packed_j % 8 == 0);
|
__assume(packed_j % 8 == 0);
|
||||||
__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
|
__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
|
||||||
#endif
|
#endif
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
|
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
|
||||||
sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
|
sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < packed_j; jj++) {
|
for (int jj = 0; jj < packed_j; jj++) {
|
||||||
flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
|
flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
|
||||||
@ -458,15 +458,15 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2;
|
flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2;
|
||||||
flt_t rtor_0, rtor_1, rtor_2;
|
flt_t rtor_0, rtor_1, rtor_2;
|
||||||
|
|
||||||
const int sbindex = jlist_form[jj] >> SBBITS & 3;
|
const int sbindex = jlist_form[jj] >> SBBITS & 3;
|
||||||
const int j = jlist_form[jj] & NEIGHMASK;
|
const int j = jlist_form[jj] & NEIGHMASK;
|
||||||
flt_t factor_lj = special_lj[sbindex];
|
flt_t factor_lj = special_lj[sbindex];
|
||||||
const int jtype = jtype_form[jj];
|
const int jtype = jtype_form[jj];
|
||||||
const flt_t sigma = ijci[jtype].sigma;
|
const flt_t sigma = ijci[jtype].sigma;
|
||||||
const flt_t epsilon = ijci[jtype].epsilon;
|
const flt_t epsilon = ijci[jtype].epsilon;
|
||||||
const flt_t shape2_0 = ic[jtype].shape2[0];
|
const flt_t shape2_0 = ic[jtype].shape2[0];
|
||||||
const flt_t shape2_1 = ic[jtype].shape2[1];
|
const flt_t shape2_1 = ic[jtype].shape2[1];
|
||||||
const flt_t shape2_2 = ic[jtype].shape2[2];
|
const flt_t shape2_2 = ic[jtype].shape2[2];
|
||||||
flt_t one_eng, evdwl;
|
flt_t one_eng, evdwl;
|
||||||
|
|
||||||
ME_quat_to_mat_trans(quat[j], a2);
|
ME_quat_to_mat_trans(quat[j], a2);
|
||||||
@ -488,7 +488,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
ME_plus3(g1, g2, g12);
|
ME_plus3(g1, g2, g12);
|
||||||
flt_t kappa_0, kappa_1, kappa_2;
|
flt_t kappa_0, kappa_1, kappa_2;
|
||||||
ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj],
|
ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj],
|
||||||
kappa, ierror);
|
kappa, ierror);
|
||||||
|
|
||||||
// tempv = G12^-1*r12hat
|
// tempv = G12^-1*r12hat
|
||||||
|
|
||||||
@ -520,7 +520,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
flt_t iota_0, iota_1, iota_2;
|
flt_t iota_0, iota_1, iota_2;
|
||||||
ME_plus3(b1, b2, b12);
|
ME_plus3(b1, b2, b12);
|
||||||
ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj],
|
ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj],
|
||||||
iota, ierror);
|
iota, ierror);
|
||||||
|
|
||||||
// tempv = G12^-1*r12hat
|
// tempv = G12^-1*r12hat
|
||||||
|
|
||||||
@ -534,7 +534,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
// compute dUr/dr
|
// compute dUr/dr
|
||||||
|
|
||||||
temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
|
temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
|
||||||
sigma;
|
sigma;
|
||||||
temp1 = temp1 * (flt_t)24.0 * epsilon;
|
temp1 = temp1 * (flt_t)24.0 * epsilon;
|
||||||
flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
|
flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
|
||||||
flt_t dUr_0, dUr_1, dUr_2;
|
flt_t dUr_0, dUr_1, dUr_2;
|
||||||
@ -548,8 +548,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
flt_t dchi_0, dchi_1, dchi_2;
|
flt_t dchi_0, dchi_1, dchi_2;
|
||||||
temp1 = ME_dot3(iota, r12hat);
|
temp1 = ME_dot3(iota, r12hat);
|
||||||
temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
|
temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
|
||||||
std::pow(chi, (mu - (flt_t)1.0) / mu);
|
std::pow(chi, (mu - (flt_t)1.0) / mu);
|
||||||
dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
|
dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
|
||||||
dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
|
dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
|
||||||
dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
|
dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
|
||||||
@ -663,36 +663,36 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
temp3 = chi * eta;
|
temp3 = chi * eta;
|
||||||
|
|
||||||
ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) *
|
ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) *
|
||||||
(flt_t)-1.0;
|
(flt_t)-1.0;
|
||||||
ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) *
|
ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) *
|
||||||
(flt_t)-1.0;
|
(flt_t)-1.0;
|
||||||
ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
|
ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
|
||||||
(flt_t)-1.0;
|
(flt_t)-1.0;
|
||||||
|
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
|
rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
|
||||||
(flt_t)-1.0;
|
(flt_t)-1.0;
|
||||||
rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
|
rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
|
||||||
(flt_t)-1.0;
|
(flt_t)-1.0;
|
||||||
rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) *
|
rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) *
|
||||||
(flt_t)-1.0;
|
(flt_t)-1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
one_eng = temp1 * chi;
|
one_eng = temp1 * chi;
|
||||||
#ifndef INTEL_VMASK
|
#ifndef INTEL_VMASK
|
||||||
if (jlist_form[jj] == nall) {
|
if (jlist_form[jj] == nall) {
|
||||||
one_eng = (flt_t)0.0;
|
one_eng = (flt_t)0.0;
|
||||||
fforce_0 = 0.0;
|
fforce_0 = 0.0;
|
||||||
fforce_1 = 0.0;
|
fforce_1 = 0.0;
|
||||||
fforce_2 = 0.0;
|
fforce_2 = 0.0;
|
||||||
ttor_0 = 0.0;
|
ttor_0 = 0.0;
|
||||||
ttor_1 = 0.0;
|
ttor_1 = 0.0;
|
||||||
ttor_2 = 0.0;
|
ttor_2 = 0.0;
|
||||||
rtor_0 = 0.0;
|
rtor_0 = 0.0;
|
||||||
rtor_1 = 0.0;
|
rtor_1 = 0.0;
|
||||||
rtor_2 = 0.0;
|
rtor_2 = 0.0;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
fforce_0 *= factor_lj;
|
fforce_0 *= factor_lj;
|
||||||
fforce_1 *= factor_lj;
|
fforce_1 *= factor_lj;
|
||||||
@ -701,53 +701,53 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
ttor_1 *= factor_lj;
|
ttor_1 *= factor_lj;
|
||||||
ttor_2 *= factor_lj;
|
ttor_2 *= factor_lj;
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
if (jlist_form[jj] < nall) {
|
if (jlist_form[jj] < nall) {
|
||||||
#endif
|
#endif
|
||||||
fxtmp += fforce_0;
|
fxtmp += fforce_0;
|
||||||
fytmp += fforce_1;
|
fytmp += fforce_1;
|
||||||
fztmp += fforce_2;
|
fztmp += fforce_2;
|
||||||
t1tmp += ttor_0;
|
t1tmp += ttor_0;
|
||||||
t2tmp += ttor_1;
|
t2tmp += ttor_1;
|
||||||
t3tmp += ttor_2;
|
t3tmp += ttor_2;
|
||||||
|
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
rtor_0 *= factor_lj;
|
rtor_0 *= factor_lj;
|
||||||
rtor_1 *= factor_lj;
|
rtor_1 *= factor_lj;
|
||||||
rtor_2 *= factor_lj;
|
rtor_2 *= factor_lj;
|
||||||
int jp = j * 2;
|
int jp = j * 2;
|
||||||
f[jp].x -= fforce_0;
|
f[jp].x -= fforce_0;
|
||||||
f[jp].y -= fforce_1;
|
f[jp].y -= fforce_1;
|
||||||
f[jp].z -= fforce_2;
|
f[jp].z -= fforce_2;
|
||||||
jp++;
|
jp++;
|
||||||
f[jp].x += rtor_0;
|
f[jp].x += rtor_0;
|
||||||
f[jp].y += rtor_1;
|
f[jp].y += rtor_1;
|
||||||
f[jp].z += rtor_2;
|
f[jp].z += rtor_2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
evdwl = factor_lj * one_eng;
|
evdwl = factor_lj * one_eng;
|
||||||
sevdwl += evdwl;
|
sevdwl += evdwl;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
fwtmp += (flt_t)0.5 * evdwl;
|
fwtmp += (flt_t)0.5 * evdwl;
|
||||||
if (NEWTON_PAIR)
|
if (NEWTON_PAIR)
|
||||||
f[j*2].w += (flt_t)0.5 * evdwl;
|
f[j*2].w += (flt_t)0.5 * evdwl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_PAIR == 0) {
|
if (NEWTON_PAIR == 0) {
|
||||||
if (vflag == 1) {
|
if (vflag == 1) {
|
||||||
sv0 += delx_form[jj] * fforce_0;
|
sv0 += delx_form[jj] * fforce_0;
|
||||||
sv1 += dely_form[jj] * fforce_1;
|
sv1 += dely_form[jj] * fforce_1;
|
||||||
sv2 += delz_form[jj] * fforce_2;
|
sv2 += delz_form[jj] * fforce_2;
|
||||||
sv3 += delx_form[jj] * fforce_1;
|
sv3 += delx_form[jj] * fforce_1;
|
||||||
sv4 += delx_form[jj] * fforce_2;
|
sv4 += delx_form[jj] * fforce_2;
|
||||||
sv5 += dely_form[jj] * fforce_2;
|
sv5 += dely_form[jj] * fforce_2;
|
||||||
}
|
}
|
||||||
} // EVFLAG
|
} // EVFLAG
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // for jj
|
} // for jj
|
||||||
|
|
||||||
// -------------------------------------------------------------
|
// -------------------------------------------------------------
|
||||||
@ -756,29 +756,29 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
ierror = 2;
|
ierror = 2;
|
||||||
|
|
||||||
int ip = i * 2;
|
int ip = i * 2;
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
f[ip].x += fxtmp;
|
f[ip].x += fxtmp;
|
||||||
f[ip].y += fytmp;
|
f[ip].y += fytmp;
|
||||||
f[ip].z += fztmp;
|
f[ip].z += fztmp;
|
||||||
ip++;
|
ip++;
|
||||||
f[ip].x += t1tmp;
|
f[ip].x += t1tmp;
|
||||||
f[ip].y += t2tmp;
|
f[ip].y += t2tmp;
|
||||||
f[ip].z += t3tmp;
|
f[ip].z += t3tmp;
|
||||||
} else {
|
} else {
|
||||||
f[ip].x = fxtmp;
|
f[ip].x = fxtmp;
|
||||||
f[ip].y = fytmp;
|
f[ip].y = fytmp;
|
||||||
f[ip].z = fztmp;
|
f[ip].z = fztmp;
|
||||||
ip++;
|
ip++;
|
||||||
f[ip].x = t1tmp;
|
f[ip].x = t1tmp;
|
||||||
f[ip].y = t2tmp;
|
f[ip].y = t2tmp;
|
||||||
f[ip].z = t3tmp;
|
f[ip].z = t3tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
oevdwl += sevdwl;
|
oevdwl += sevdwl;
|
||||||
if (eatom) f[i * 2].w += fwtmp;
|
if (eatom) f[i * 2].w += fwtmp;
|
||||||
}
|
}
|
||||||
if (NEWTON_PAIR == 0) {
|
if (NEWTON_PAIR == 0) {
|
||||||
if (vflag == 1) {
|
if (vflag == 1) {
|
||||||
ov0 += sv0;
|
ov0 += sv0;
|
||||||
ov1 += sv1;
|
ov1 += sv1;
|
||||||
@ -792,30 +792,30 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
int o_range;
|
int o_range;
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
o_range = nall;
|
o_range = nall;
|
||||||
if (offload == 0) o_range -= minlocal;
|
if (offload == 0) o_range -= minlocal;
|
||||||
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
|
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
|
||||||
sizeof(FORCE_T));
|
sizeof(FORCE_T));
|
||||||
const int sto = iito * 8;
|
const int sto = iito * 8;
|
||||||
const int fst4 = f_stride * 4;
|
const int fst4 = f_stride * 4;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp barrier
|
#pragma omp barrier
|
||||||
#endif
|
#endif
|
||||||
acc_t *f_scalar = &f_start[0].x;
|
acc_t *f_scalar = &f_start[0].x;
|
||||||
acc_t *f_scalar2 = f_scalar + fst4;
|
acc_t *f_scalar2 = f_scalar + fst4;
|
||||||
for (int t = 1; t < nthreads; t++) {
|
for (int t = 1; t < nthreads; t++) {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for (int n = iifrom * 8; n < sto; n++)
|
for (int n = iifrom * 8; n < sto; n++)
|
||||||
f_scalar[n] += f_scalar2[n];
|
f_scalar[n] += f_scalar2[n];
|
||||||
f_scalar2 += fst4;
|
f_scalar2 += fst4;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (vflag==2) {
|
if (vflag==2) {
|
||||||
const ATOM_T * _noalias const xo = x + minlocal;
|
const ATOM_T * _noalias const xo = x + minlocal;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma novector
|
#pragma novector
|
||||||
#endif
|
#endif
|
||||||
for (int n = iifrom; n < iito; n++) {
|
for (int n = iifrom; n < iito; n++) {
|
||||||
const int nt2 = n * 2;
|
const int nt2 = n * 2;
|
||||||
@ -826,7 +826,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
ov4 += f_start[nt2].z * xo[n].x;
|
ov4 += f_start[nt2].z * xo[n].x;
|
||||||
ov5 += f_start[nt2].z * xo[n].y;
|
ov5 += f_start[nt2].z * xo[n].y;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ierror)
|
if (ierror)
|
||||||
@ -840,12 +840,12 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
}
|
}
|
||||||
if (vflag) {
|
if (vflag) {
|
||||||
if (NEWTON_PAIR == 0) {
|
if (NEWTON_PAIR == 0) {
|
||||||
ov0 *= (acc_t)-0.5;
|
ov0 *= (acc_t)-0.5;
|
||||||
ov1 *= (acc_t)-0.5;
|
ov1 *= (acc_t)-0.5;
|
||||||
ov2 *= (acc_t)-0.5;
|
ov2 *= (acc_t)-0.5;
|
||||||
ov3 *= (acc_t)-0.5;
|
ov3 *= (acc_t)-0.5;
|
||||||
ov4 *= (acc_t)-0.5;
|
ov4 *= (acc_t)-0.5;
|
||||||
ov5 *= (acc_t)-0.5;
|
ov5 *= (acc_t)-0.5;
|
||||||
}
|
}
|
||||||
ev_global[2] = ov0;
|
ev_global[2] = ov0;
|
||||||
ev_global[3] = ov1;
|
ev_global[3] = ov1;
|
||||||
@ -982,7 +982,7 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
const int one_length,
|
const int one_length,
|
||||||
const int nthreads,
|
const int nthreads,
|
||||||
Memory *memory,
|
Memory *memory,
|
||||||
const int cop) {
|
const int cop) {
|
||||||
if (ntypes != _ntypes) {
|
if (ntypes != _ntypes) {
|
||||||
if (_ntypes > 0) {
|
if (_ntypes > 0) {
|
||||||
fc_packed3 *oic = ic;
|
fc_packed3 *oic = ic;
|
||||||
@ -999,9 +999,9 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
int * ojlist_form = jlist_form[0];
|
int * ojlist_form = jlist_form[0];
|
||||||
|
|
||||||
if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
|
if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
|
||||||
orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
|
orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
|
||||||
odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL &&
|
odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL &&
|
||||||
_cop >= 0) {
|
_cop >= 0) {
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \
|
nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \
|
||||||
nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \
|
nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \
|
||||||
@ -1033,14 +1033,14 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
memory->create(jlist_form, nthreads, one_length, "jlist_form");
|
memory->create(jlist_form, nthreads, one_length, "jlist_form");
|
||||||
|
|
||||||
for (int zn = 0; zn < nthreads; zn++)
|
for (int zn = 0; zn < nthreads; zn++)
|
||||||
for (int zo = 0; zo < one_length; zo++) {
|
for (int zo = 0; zo < one_length; zo++) {
|
||||||
rsq_form[zn][zo] = 10.0;
|
rsq_form[zn][zo] = 10.0;
|
||||||
delx_form[zn][zo] = 10.0;
|
delx_form[zn][zo] = 10.0;
|
||||||
dely_form[zn][zo] = 10.0;
|
dely_form[zn][zo] = 10.0;
|
||||||
delz_form[zn][zo] = 10.0;
|
delz_form[zn][zo] = 10.0;
|
||||||
jtype_form[zn][zo] = 1;
|
jtype_form[zn][zo] = 1;
|
||||||
jlist_form[zn][zo] = 0;
|
jlist_form[zn][zo] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
flt_t * ospecial_lj = special_lj;
|
flt_t * ospecial_lj = special_lj;
|
||||||
@ -1057,9 +1057,9 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
|
|
||||||
int tp1sq = ntypes*ntypes;
|
int tp1sq = ntypes*ntypes;
|
||||||
if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
|
if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
|
||||||
oic != NULL && orsq_form != NULL && odelx_form != NULL &&
|
oic != NULL && orsq_form != NULL && odelx_form != NULL &&
|
||||||
odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL &&
|
odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL &&
|
||||||
ojlist_form !=NULL && cop >= 0) {
|
ojlist_form !=NULL && cop >= 0) {
|
||||||
#pragma offload_transfer target(mic:cop) \
|
#pragma offload_transfer target(mic:cop) \
|
||||||
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
|
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
|
||||||
nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \
|
nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \
|
||||||
|
|||||||
@ -67,8 +67,8 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
|
void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) {
|
if (eflag || vflag) {
|
||||||
ev_setup(eflag,vflag);
|
ev_setup(eflag,vflag);
|
||||||
@ -125,9 +125,9 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
|
|||||||
|
|
||||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
const int astart, const int aend)
|
const int astart, const int aend)
|
||||||
{
|
{
|
||||||
const int inum = aend - astart;
|
const int inum = aend - astart;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@ -177,8 +177,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
int tc;
|
int tc;
|
||||||
FORCE_T * _noalias f_start;
|
FORCE_T * _noalias f_start;
|
||||||
@ -227,7 +227,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
||||||
f_stride, x, q);
|
f_stride, x, q);
|
||||||
|
|
||||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||||
@ -259,7 +259,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
int * _noalias const tjtype = ccachej + toffs;
|
int * _noalias const tjtype = ccachej + toffs;
|
||||||
|
|
||||||
for (int i = iifrom; i < iito; i += iip) {
|
for (int i = iifrom; i < iito; i += iip) {
|
||||||
// const int i = ilist[ii];
|
// const int i = ilist[ii];
|
||||||
const int itype = x[i].w;
|
const int itype = x[i].w;
|
||||||
|
|
||||||
const int ptr_off = itype * ntypes;
|
const int ptr_off = itype * ntypes;
|
||||||
@ -270,175 +270,175 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
|
|
||||||
acc_t fxtmp,fytmp,fztmp,fwtmp;
|
acc_t fxtmp,fytmp,fztmp,fwtmp;
|
||||||
acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
|
acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
|
|
||||||
const flt_t xtmp = x[i].x;
|
const flt_t xtmp = x[i].x;
|
||||||
const flt_t ytmp = x[i].y;
|
const flt_t ytmp = x[i].y;
|
||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
const flt_t qtmp = q[i];
|
const flt_t qtmp = q[i];
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
|
|
||||||
int ej = 0;
|
int ej = 0;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
const int j = jlist[jj] & NEIGHMASK;
|
const int j = jlist[jj] & NEIGHMASK;
|
||||||
const flt_t delx = xtmp - x[j].x;
|
const flt_t delx = xtmp - x[j].x;
|
||||||
const flt_t dely = ytmp - x[j].y;
|
const flt_t dely = ytmp - x[j].y;
|
||||||
const flt_t delz = ztmp - x[j].z;
|
const flt_t delz = ztmp - x[j].z;
|
||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
|
|
||||||
if (rsq < cut_coulsq) {
|
if (rsq < cut_coulsq) {
|
||||||
trsq[ej]=rsq;
|
trsq[ej]=rsq;
|
||||||
tdelx[ej]=delx;
|
tdelx[ej]=delx;
|
||||||
tdely[ej]=dely;
|
tdely[ej]=dely;
|
||||||
tdelz[ej]=delz;
|
tdelz[ej]=delz;
|
||||||
tjtype[ej]=x[j].w;
|
tjtype[ej]=x[j].w;
|
||||||
tj[ej]=jlist[jj];
|
tj[ej]=jlist[jj];
|
||||||
ej++;
|
ej++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < ej; jj++) {
|
for (int jj = 0; jj < ej; jj++) {
|
||||||
flt_t forcecoul, forcelj, evdwl, ecoul;
|
flt_t forcecoul, forcelj, evdwl, ecoul;
|
||||||
forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
|
forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
|
||||||
|
|
||||||
const int j = tj[jj] & NEIGHMASK;
|
const int j = tj[jj] & NEIGHMASK;
|
||||||
const int sbindex = tj[jj] >> SBBITS & 3;
|
const int sbindex = tj[jj] >> SBBITS & 3;
|
||||||
const int jtype = tjtype[jj];
|
const int jtype = tjtype[jj];
|
||||||
const flt_t rsq = trsq[jj];
|
const flt_t rsq = trsq[jj];
|
||||||
const flt_t r2inv = (flt_t)1.0 / rsq;
|
const flt_t r2inv = (flt_t)1.0 / rsq;
|
||||||
|
|
||||||
#ifdef INTEL_ALLOW_TABLE
|
#ifdef INTEL_ALLOW_TABLE
|
||||||
if (!ncoultablebits || rsq <= tabinnersq) {
|
if (!ncoultablebits || rsq <= tabinnersq) {
|
||||||
#endif
|
#endif
|
||||||
const flt_t A1 = 0.254829592;
|
const flt_t A1 = 0.254829592;
|
||||||
const flt_t A2 = -0.284496736;
|
const flt_t A2 = -0.284496736;
|
||||||
const flt_t A3 = 1.421413741;
|
const flt_t A3 = 1.421413741;
|
||||||
const flt_t A4 = -1.453152027;
|
const flt_t A4 = -1.453152027;
|
||||||
const flt_t A5 = 1.061405429;
|
const flt_t A5 = 1.061405429;
|
||||||
const flt_t EWALD_F = 1.12837917;
|
const flt_t EWALD_F = 1.12837917;
|
||||||
const flt_t INV_EWALD_P = 1.0 / 0.3275911;
|
const flt_t INV_EWALD_P = 1.0 / 0.3275911;
|
||||||
|
|
||||||
const flt_t r = (flt_t)1.0 / sqrt(r2inv);
|
const flt_t r = (flt_t)1.0 / sqrt(r2inv);
|
||||||
const flt_t grij = g_ewald * r;
|
const flt_t grij = g_ewald * r;
|
||||||
const flt_t expm2 = exp(-grij * grij);
|
const flt_t expm2 = exp(-grij * grij);
|
||||||
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
|
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
|
||||||
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||||
const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
|
const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
|
||||||
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||||
if (EFLAG) ecoul = prefactor * erfc;
|
if (EFLAG) ecoul = prefactor * erfc;
|
||||||
|
|
||||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
|
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
|
||||||
prefactor;
|
prefactor;
|
||||||
forcecoul -= adjust;
|
forcecoul -= adjust;
|
||||||
if (EFLAG) ecoul -= adjust;
|
if (EFLAG) ecoul -= adjust;
|
||||||
|
|
||||||
#ifdef INTEL_ALLOW_TABLE
|
#ifdef INTEL_ALLOW_TABLE
|
||||||
} else {
|
} else {
|
||||||
float rsq_lookup = rsq;
|
float rsq_lookup = rsq;
|
||||||
const int itable = (__intel_castf32_u32(rsq_lookup) &
|
const int itable = (__intel_castf32_u32(rsq_lookup) &
|
||||||
ncoulmask) >> ncoulshiftbits;
|
ncoulmask) >> ncoulshiftbits;
|
||||||
const flt_t fraction = (rsq_lookup - table[itable].r) *
|
const flt_t fraction = (rsq_lookup - table[itable].r) *
|
||||||
table[itable].dr;
|
table[itable].dr;
|
||||||
|
|
||||||
const flt_t tablet = table[itable].f +
|
const flt_t tablet = table[itable].f +
|
||||||
fraction * table[itable].df;
|
fraction * table[itable].df;
|
||||||
forcecoul = qtmp * q[j] * tablet;
|
forcecoul = qtmp * q[j] * tablet;
|
||||||
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
|
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
|
||||||
fraction * detable[itable]);
|
fraction * detable[itable]);
|
||||||
if (sbindex) {
|
if (sbindex) {
|
||||||
const flt_t table2 = ctable[itable] +
|
const flt_t table2 = ctable[itable] +
|
||||||
fraction * dctable[itable];
|
fraction * dctable[itable];
|
||||||
const flt_t prefactor = qtmp * q[j] * table2;
|
const flt_t prefactor = qtmp * q[j] * table2;
|
||||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
|
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
|
||||||
prefactor;
|
prefactor;
|
||||||
forcecoul -= adjust;
|
forcecoul -= adjust;
|
||||||
if (EFLAG) ecoul -= adjust;
|
if (EFLAG) ecoul -= adjust;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
if (rsq < cut_ljsq) {
|
if (rsq < cut_ljsq) {
|
||||||
#endif
|
#endif
|
||||||
flt_t r6inv = r2inv * r2inv * r2inv;
|
flt_t r6inv = r2inv * r2inv * r2inv;
|
||||||
forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
|
forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
|
||||||
if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
|
if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
if (rsq > cut_lj_innersq) {
|
if (rsq > cut_lj_innersq) {
|
||||||
#endif
|
#endif
|
||||||
const flt_t drsq = cut_ljsq - rsq;
|
const flt_t drsq = cut_ljsq - rsq;
|
||||||
const flt_t cut2 = (rsq - cut_lj_innersq) * drsq;
|
const flt_t cut2 = (rsq - cut_lj_innersq) * drsq;
|
||||||
const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) *
|
const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) *
|
||||||
inv_denom_lj;
|
inv_denom_lj;
|
||||||
const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj;
|
const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj;
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
#ifndef INTEL_VMASK
|
#ifndef INTEL_VMASK
|
||||||
if (rsq > cut_lj_innersq) {
|
if (rsq > cut_lj_innersq) {
|
||||||
#endif
|
#endif
|
||||||
forcelj = forcelj * switch1 + evdwl * switch2;
|
forcelj = forcelj * switch1 + evdwl * switch2;
|
||||||
evdwl *= switch1;
|
evdwl *= switch1;
|
||||||
#ifndef INTEL_VMASK
|
#ifndef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
const flt_t philj = r6inv * (lji[jtype].z*r6inv -
|
const flt_t philj = r6inv * (lji[jtype].z*r6inv -
|
||||||
lji[jtype].w);
|
lji[jtype].w);
|
||||||
#ifndef INTEL_VMASK
|
#ifndef INTEL_VMASK
|
||||||
if (rsq > cut_lj_innersq)
|
if (rsq > cut_lj_innersq)
|
||||||
#endif
|
#endif
|
||||||
forcelj = forcelj * switch1 + philj * switch2;
|
forcelj = forcelj * switch1 + philj * switch2;
|
||||||
}
|
}
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (sbindex) {
|
if (sbindex) {
|
||||||
const flt_t factor_lj = special_lj[sbindex];
|
const flt_t factor_lj = special_lj[sbindex];
|
||||||
forcelj *= factor_lj;
|
forcelj *= factor_lj;
|
||||||
if (EFLAG) evdwl *= factor_lj;
|
if (EFLAG) evdwl *= factor_lj;
|
||||||
}
|
}
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const flt_t fpair = (forcecoul + forcelj) * r2inv;
|
const flt_t fpair = (forcecoul + forcelj) * r2inv;
|
||||||
const flt_t fpx = fpair * tdelx[jj];
|
const flt_t fpx = fpair * tdelx[jj];
|
||||||
fxtmp += fpx;
|
fxtmp += fpx;
|
||||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
const flt_t fpy = fpair * tdely[jj];
|
const flt_t fpy = fpair * tdely[jj];
|
||||||
fytmp += fpy;
|
fytmp += fpy;
|
||||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||||
const flt_t fpz = fpair * tdelz[jj];
|
const flt_t fpz = fpair * tdelz[jj];
|
||||||
fztmp += fpz;
|
fztmp += fpz;
|
||||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
sevdwl += evdwl;
|
sevdwl += evdwl;
|
||||||
secoul += ecoul;
|
secoul += ecoul;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
if (NEWTON_PAIR)
|
if (NEWTON_PAIR)
|
||||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
||||||
fpx, fpy, fpz);
|
fpx, fpy, fpz);
|
||||||
} // for jj
|
} // for jj
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
f[i].x += fxtmp;
|
f[i].x += fxtmp;
|
||||||
@ -449,33 +449,33 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
f[i].y = fytmp;
|
f[i].y = fytmp;
|
||||||
f[i].z = fztmp;
|
f[i].z = fztmp;
|
||||||
}
|
}
|
||||||
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
ov4, ov5);
|
ov4, ov5);
|
||||||
} // end of omp parallel region
|
} // end of omp parallel region
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
if (NEWTON_PAIR == 0) {
|
if (NEWTON_PAIR == 0) {
|
||||||
oevdwl *= (acc_t)0.5;
|
oevdwl *= (acc_t)0.5;
|
||||||
oecoul *= (acc_t)0.5;
|
oecoul *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
ev_global[0] = oevdwl;
|
ev_global[0] = oevdwl;
|
||||||
ev_global[1] = oecoul;
|
ev_global[1] = oecoul;
|
||||||
}
|
}
|
||||||
if (vflag) {
|
if (vflag) {
|
||||||
if (NEWTON_PAIR == 0) {
|
if (NEWTON_PAIR == 0) {
|
||||||
ov0 *= (acc_t)0.5;
|
ov0 *= (acc_t)0.5;
|
||||||
ov1 *= (acc_t)0.5;
|
ov1 *= (acc_t)0.5;
|
||||||
ov2 *= (acc_t)0.5;
|
ov2 *= (acc_t)0.5;
|
||||||
ov3 *= (acc_t)0.5;
|
ov3 *= (acc_t)0.5;
|
||||||
ov4 *= (acc_t)0.5;
|
ov4 *= (acc_t)0.5;
|
||||||
ov5 *= (acc_t)0.5;
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
ev_global[2] = ov0;
|
ev_global[2] = ov0;
|
||||||
ev_global[3] = ov1;
|
ev_global[3] = ov1;
|
||||||
@ -556,7 +556,7 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
double cut, cutneigh;
|
double cut, cutneigh;
|
||||||
if (cut_lj > cut_coul)
|
if (cut_lj > cut_coul)
|
||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
|
"Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
|
||||||
for (int i = 1; i <= atom->ntypes; i++) {
|
for (int i = 1; i <= atom->ntypes; i++) {
|
||||||
for (int j = i; j <= atom->ntypes; j++) {
|
for (int j = i; j <= atom->ntypes; j++) {
|
||||||
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
|
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
|
||||||
@ -637,7 +637,7 @@ template <class flt_t>
|
|||||||
void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
||||||
const int ntable,
|
const int ntable,
|
||||||
Memory *memory,
|
Memory *memory,
|
||||||
const int cop) {
|
const int cop) {
|
||||||
if ( (ntypes != _ntypes || ntable != _ntable) ) {
|
if ( (ntypes != _ntypes || ntable != _ntable) ) {
|
||||||
if (_ntypes > 0) {
|
if (_ntypes > 0) {
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
@ -653,12 +653,12 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
|
if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
|
||||||
otable != NULL && oetable != NULL && odetable != NULL &&
|
otable != NULL && oetable != NULL && odetable != NULL &&
|
||||||
octable != NULL && odctable != NULL && ospecial_coul != NULL &&
|
octable != NULL && odctable != NULL && ospecial_coul != NULL &&
|
||||||
cop >= 0) {
|
cop >= 0) {
|
||||||
#pragma offload_transfer target(mic:cop) \
|
#pragma offload_transfer target(mic:cop) \
|
||||||
nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
|
nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
|
||||||
nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \
|
nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \
|
||||||
nocopy(otable: alloc_if(0) free_if(1)) \
|
nocopy(otable: alloc_if(0) free_if(1)) \
|
||||||
nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
|
nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -694,7 +694,7 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
|
if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
|
||||||
otable !=NULL && oetable != NULL && odetable != NULL &&
|
otable !=NULL && oetable != NULL && odetable != NULL &&
|
||||||
octable != NULL && odctable != NULL && ospecial_coul != NULL &&
|
octable != NULL && odctable != NULL && ospecial_coul != NULL &&
|
||||||
cop >= 0) {
|
cop >= 0) {
|
||||||
#pragma offload_transfer target(mic:cop) \
|
#pragma offload_transfer target(mic:cop) \
|
||||||
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
|
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
|
||||||
nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
|
nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
|
||||||
|
|||||||
@ -50,8 +50,8 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
|
|||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void eval(const int offload, const int vflag,
|
void eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> * buffers,
|
IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
@ -75,7 +75,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
|
|||||||
~ForceConst() { set_ntypes(0,0,NULL,_cop); }
|
~ForceConst() { set_ntypes(0,0,NULL,_cop); }
|
||||||
|
|
||||||
void set_ntypes(const int ntypes, const int ntable, Memory *memory,
|
void set_ntypes(const int ntypes, const int ntable, Memory *memory,
|
||||||
const int cop);
|
const int cop);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int _ntypes, _ntable, _cop;
|
int _ntypes, _ntable, _cop;
|
||||||
|
|||||||
@ -68,8 +68,8 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag)
|
|||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
|
void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) {
|
if (eflag || vflag) {
|
||||||
ev_setup(eflag,vflag);
|
ev_setup(eflag,vflag);
|
||||||
@ -92,7 +92,7 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
|
|||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||||
packthreads, sizeof(ATOM_T));
|
packthreads, sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom,ito,ago);
|
buffers->thr_pack(ifrom,ito,ago);
|
||||||
}
|
}
|
||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
@ -124,9 +124,9 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
|
|||||||
|
|
||||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
const int astart, const int aend)
|
const int astart, const int aend)
|
||||||
{
|
{
|
||||||
const int inum = aend - astart;
|
const int inum = aend - astart;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@ -171,8 +171,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
int tc;
|
int tc;
|
||||||
FORCE_T * _noalias f_start;
|
FORCE_T * _noalias f_start;
|
||||||
@ -208,7 +208,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
|
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
|
||||||
in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
|
in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
|
||||||
in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
|
in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
|
||||||
in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \
|
in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \
|
||||||
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
|
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
|
||||||
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
|
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
|
||||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
||||||
@ -220,7 +220,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
||||||
f_stride, x, q);
|
f_stride, x, q);
|
||||||
|
|
||||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||||
@ -261,18 +261,18 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
|
|
||||||
acc_t fxtmp,fytmp,fztmp,fwtmp;
|
acc_t fxtmp,fytmp,fztmp,fwtmp;
|
||||||
acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
|
acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
|
|
||||||
const flt_t xtmp = x[i].x;
|
const flt_t xtmp = x[i].x;
|
||||||
const flt_t ytmp = x[i].y;
|
const flt_t ytmp = x[i].y;
|
||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
const flt_t qtmp = q[i];
|
const flt_t qtmp = q[i];
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
|
|
||||||
int ej = 0;
|
int ej = 0;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma ivdep
|
#pragma ivdep
|
||||||
@ -282,91 +282,91 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t delx = xtmp - x[j].x;
|
const flt_t delx = xtmp - x[j].x;
|
||||||
const flt_t dely = ytmp - x[j].y;
|
const flt_t dely = ytmp - x[j].y;
|
||||||
const flt_t delz = ztmp - x[j].z;
|
const flt_t delz = ztmp - x[j].z;
|
||||||
const int jtype = x[j].w;
|
const int jtype = x[j].w;
|
||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
|
|
||||||
if (rsq < c_forcei[jtype].cutsq) {
|
if (rsq < c_forcei[jtype].cutsq) {
|
||||||
trsq[ej]=rsq;
|
trsq[ej]=rsq;
|
||||||
tdelx[ej]=delx;
|
tdelx[ej]=delx;
|
||||||
tdely[ej]=dely;
|
tdely[ej]=dely;
|
||||||
tdelz[ej]=delz;
|
tdelz[ej]=delz;
|
||||||
tjtype[ej]=jtype;
|
tjtype[ej]=jtype;
|
||||||
tj[ej]=jlist[jj];
|
tj[ej]=jlist[jj];
|
||||||
ej++;
|
ej++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < ej; jj++) {
|
for (int jj = 0; jj < ej; jj++) {
|
||||||
flt_t forcecoul, forcelj, evdwl, ecoul;
|
flt_t forcecoul, forcelj, evdwl, ecoul;
|
||||||
forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
|
forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
|
||||||
|
|
||||||
const int j = tj[jj] & NEIGHMASK;
|
const int j = tj[jj] & NEIGHMASK;
|
||||||
const int sbindex = tj[jj] >> SBBITS & 3;
|
const int sbindex = tj[jj] >> SBBITS & 3;
|
||||||
const int jtype = tjtype[jj];
|
const int jtype = tjtype[jj];
|
||||||
const flt_t rsq = trsq[jj];
|
const flt_t rsq = trsq[jj];
|
||||||
const flt_t r2inv = (flt_t)1.0 / rsq;
|
const flt_t r2inv = (flt_t)1.0 / rsq;
|
||||||
|
|
||||||
#ifdef INTEL_ALLOW_TABLE
|
#ifdef INTEL_ALLOW_TABLE
|
||||||
if (!ncoultablebits || rsq <= tabinnersq) {
|
if (!ncoultablebits || rsq <= tabinnersq) {
|
||||||
#endif
|
#endif
|
||||||
const flt_t A1 = 0.254829592;
|
const flt_t A1 = 0.254829592;
|
||||||
const flt_t A2 = -0.284496736;
|
const flt_t A2 = -0.284496736;
|
||||||
const flt_t A3 = 1.421413741;
|
const flt_t A3 = 1.421413741;
|
||||||
const flt_t A4 = -1.453152027;
|
const flt_t A4 = -1.453152027;
|
||||||
const flt_t A5 = 1.061405429;
|
const flt_t A5 = 1.061405429;
|
||||||
const flt_t EWALD_F = 1.12837917;
|
const flt_t EWALD_F = 1.12837917;
|
||||||
const flt_t INV_EWALD_P = 1.0 / 0.3275911;
|
const flt_t INV_EWALD_P = 1.0 / 0.3275911;
|
||||||
|
|
||||||
const flt_t r = (flt_t)1.0 / sqrt(r2inv);
|
const flt_t r = (flt_t)1.0 / sqrt(r2inv);
|
||||||
const flt_t grij = g_ewald * r;
|
const flt_t grij = g_ewald * r;
|
||||||
const flt_t expm2 = exp(-grij * grij);
|
const flt_t expm2 = exp(-grij * grij);
|
||||||
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
|
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
|
||||||
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||||
const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
|
const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
|
||||||
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||||
if (EFLAG) ecoul = prefactor * erfc;
|
if (EFLAG) ecoul = prefactor * erfc;
|
||||||
|
|
||||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
|
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
|
||||||
prefactor;
|
prefactor;
|
||||||
forcecoul -= adjust;
|
forcecoul -= adjust;
|
||||||
if (EFLAG) ecoul -= adjust;
|
if (EFLAG) ecoul -= adjust;
|
||||||
|
|
||||||
#ifdef INTEL_ALLOW_TABLE
|
#ifdef INTEL_ALLOW_TABLE
|
||||||
} else {
|
} else {
|
||||||
float rsq_lookup = rsq;
|
float rsq_lookup = rsq;
|
||||||
const int itable = (__intel_castf32_u32(rsq_lookup) &
|
const int itable = (__intel_castf32_u32(rsq_lookup) &
|
||||||
ncoulmask) >> ncoulshiftbits;
|
ncoulmask) >> ncoulshiftbits;
|
||||||
const flt_t fraction = (rsq_lookup - table[itable].r) *
|
const flt_t fraction = (rsq_lookup - table[itable].r) *
|
||||||
table[itable].dr;
|
table[itable].dr;
|
||||||
|
|
||||||
const flt_t tablet = table[itable].f +
|
const flt_t tablet = table[itable].f +
|
||||||
fraction * table[itable].df;
|
fraction * table[itable].df;
|
||||||
forcecoul = qtmp * q[j] * tablet;
|
forcecoul = qtmp * q[j] * tablet;
|
||||||
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
|
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
|
||||||
fraction * detable[itable]);
|
fraction * detable[itable]);
|
||||||
if (sbindex) {
|
if (sbindex) {
|
||||||
const flt_t table2 = ctable[itable] +
|
const flt_t table2 = ctable[itable] +
|
||||||
fraction * dctable[itable];
|
fraction * dctable[itable];
|
||||||
const flt_t prefactor = qtmp * q[j] * table2;
|
const flt_t prefactor = qtmp * q[j] * table2;
|
||||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
|
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
|
||||||
prefactor;
|
prefactor;
|
||||||
forcecoul -= adjust;
|
forcecoul -= adjust;
|
||||||
if (EFLAG) ecoul -= adjust;
|
if (EFLAG) ecoul -= adjust;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
if (rsq < c_forcei[jtype].cut_ljsq) {
|
if (rsq < c_forcei[jtype].cut_ljsq) {
|
||||||
#endif
|
#endif
|
||||||
flt_t r6inv = r2inv * r2inv * r2inv;
|
flt_t r6inv = r2inv * r2inv * r2inv;
|
||||||
forcelj = r6inv * (c_forcei[jtype].lj1 * r6inv -
|
forcelj = r6inv * (c_forcei[jtype].lj1 * r6inv -
|
||||||
c_forcei[jtype].lj2);
|
c_forcei[jtype].lj2);
|
||||||
if (EFLAG) evdwl = r6inv*(c_energyi[jtype].lj3 * r6inv -
|
if (EFLAG) evdwl = r6inv*(c_energyi[jtype].lj3 * r6inv -
|
||||||
c_energyi[jtype].lj4) -
|
c_energyi[jtype].lj4) -
|
||||||
c_energyi[jtype].offset;
|
c_energyi[jtype].offset;
|
||||||
@ -376,14 +376,14 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
forcelj *= factor_lj;
|
forcelj *= factor_lj;
|
||||||
if (EFLAG) evdwl *= factor_lj;
|
if (EFLAG) evdwl *= factor_lj;
|
||||||
}
|
}
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
if (rsq > c_forcei[jtype].cut_ljsq)
|
if (rsq > c_forcei[jtype].cut_ljsq)
|
||||||
{ forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
{ forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const flt_t fpair = (forcecoul + forcelj) * r2inv;
|
const flt_t fpair = (forcecoul + forcelj) * r2inv;
|
||||||
const flt_t fpx = fpair * tdelx[jj];
|
const flt_t fpx = fpair * tdelx[jj];
|
||||||
fxtmp += fpx;
|
fxtmp += fpx;
|
||||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
@ -394,58 +394,58 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
fztmp += fpz;
|
fztmp += fpz;
|
||||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
sevdwl += evdwl;
|
sevdwl += evdwl;
|
||||||
secoul += ecoul;
|
secoul += ecoul;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
if (NEWTON_PAIR)
|
if (NEWTON_PAIR)
|
||||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
||||||
fpx, fpy, fpz);
|
fpx, fpy, fpz);
|
||||||
} // for jj
|
} // for jj
|
||||||
|
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
f[i].x += fxtmp;
|
f[i].x += fxtmp;
|
||||||
f[i].y += fytmp;
|
f[i].y += fytmp;
|
||||||
f[i].z += fztmp;
|
f[i].z += fztmp;
|
||||||
} else {
|
} else {
|
||||||
f[i].x = fxtmp;
|
f[i].x = fxtmp;
|
||||||
f[i].y = fytmp;
|
f[i].y = fytmp;
|
||||||
f[i].z = fztmp;
|
f[i].z = fztmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
ov4, ov5);
|
ov4, ov5);
|
||||||
} // end of omp parallel region
|
} // end of omp parallel region
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
if (NEWTON_PAIR == 0) {
|
if (NEWTON_PAIR == 0) {
|
||||||
oevdwl *= (acc_t)0.5;
|
oevdwl *= (acc_t)0.5;
|
||||||
oecoul *= (acc_t)0.5;
|
oecoul *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
ev_global[0] = oevdwl;
|
ev_global[0] = oevdwl;
|
||||||
ev_global[1] = oecoul;
|
ev_global[1] = oecoul;
|
||||||
}
|
}
|
||||||
if (vflag) {
|
if (vflag) {
|
||||||
if (NEWTON_PAIR == 0) {
|
if (NEWTON_PAIR == 0) {
|
||||||
ov0 *= (acc_t)0.5;
|
ov0 *= (acc_t)0.5;
|
||||||
ov1 *= (acc_t)0.5;
|
ov1 *= (acc_t)0.5;
|
||||||
ov2 *= (acc_t)0.5;
|
ov2 *= (acc_t)0.5;
|
||||||
ov3 *= (acc_t)0.5;
|
ov3 *= (acc_t)0.5;
|
||||||
ov4 *= (acc_t)0.5;
|
ov4 *= (acc_t)0.5;
|
||||||
ov5 *= (acc_t)0.5;
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
ev_global[2] = ov0;
|
ev_global[2] = ov0;
|
||||||
ev_global[3] = ov1;
|
ev_global[3] = ov1;
|
||||||
ev_global[4] = ov2;
|
ev_global[4] = ov2;
|
||||||
@ -547,8 +547,8 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
for (int i = 0; i < tp1; i++) {
|
for (int i = 0; i < tp1; i++) {
|
||||||
for (int j = 0; j < tp1; j++) {
|
for (int j = 0; j < tp1; j++) {
|
||||||
if (cutsq[i][j] < cut_ljsq[i][j])
|
if (cutsq[i][j] < cut_ljsq[i][j])
|
||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic");
|
"Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic");
|
||||||
fc.c_force[i][j].cutsq = cutsq[i][j];
|
fc.c_force[i][j].cutsq = cutsq[i][j];
|
||||||
fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
|
fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
|
||||||
fc.c_force[i][j].lj1 = lj1[i][j];
|
fc.c_force[i][j].lj1 = lj1[i][j];
|
||||||
@ -598,9 +598,9 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
|
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
||||||
const int ntable,
|
const int ntable,
|
||||||
Memory *memory,
|
Memory *memory,
|
||||||
const int cop) {
|
const int cop) {
|
||||||
if ( (ntypes != _ntypes || ntable != _ntable) ) {
|
if ( (ntypes != _ntypes || ntable != _ntable) ) {
|
||||||
if (_ntypes > 0) {
|
if (_ntypes > 0) {
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
@ -619,9 +619,9 @@ void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
ospecial_coul != NULL && _cop >= 0) {
|
ospecial_coul != NULL && _cop >= 0) {
|
||||||
#pragma offload_transfer target(mic:cop) \
|
#pragma offload_transfer target(mic:cop) \
|
||||||
nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
|
nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
|
||||||
nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
|
nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
|
||||||
nocopy(otable: alloc_if(0) free_if(1)) \
|
nocopy(otable: alloc_if(0) free_if(1)) \
|
||||||
nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
|
nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@ -50,8 +50,8 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
|
|||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void eval(const int offload, const int vflag,
|
void eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> * buffers,
|
IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
@ -76,7 +76,7 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
|
|||||||
~ForceConst() { set_ntypes(0,0,NULL,_cop); }
|
~ForceConst() { set_ntypes(0,0,NULL,_cop); }
|
||||||
|
|
||||||
void set_ntypes(const int ntypes, const int ntable, Memory *memory,
|
void set_ntypes(const int ntypes, const int ntable, Memory *memory,
|
||||||
const int cop);
|
const int cop);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int _ntypes, _ntable, _cop;
|
int _ntypes, _ntable, _cop;
|
||||||
|
|||||||
@ -96,37 +96,37 @@ void PairLJCutIntel::compute(int eflag, int vflag,
|
|||||||
if (_onetype) {
|
if (_onetype) {
|
||||||
if (eflag) {
|
if (eflag) {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (eflag) {
|
if (eflag) {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -161,8 +161,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
int tc;
|
int tc;
|
||||||
FORCE_T * _noalias f_start;
|
FORCE_T * _noalias f_start;
|
||||||
@ -176,7 +176,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
||||||
f_stride, x, 0);
|
f_stride, x, 0);
|
||||||
|
|
||||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EFLAG) oevdwl = (acc_t)0;
|
if (EFLAG) oevdwl = (acc_t)0;
|
||||||
@ -200,23 +200,23 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
flt_t cutsq, lj1, lj2, lj3, lj4, offset;
|
flt_t cutsq, lj1, lj2, lj3, lj4, offset;
|
||||||
if (ONETYPE) {
|
if (ONETYPE) {
|
||||||
cutsq = ljc12o[3].cutsq;
|
cutsq = ljc12o[3].cutsq;
|
||||||
lj1 = ljc12o[3].lj1;
|
lj1 = ljc12o[3].lj1;
|
||||||
lj2 = ljc12o[3].lj2;
|
lj2 = ljc12o[3].lj2;
|
||||||
lj3 = lj34[3].lj3;
|
lj3 = lj34[3].lj3;
|
||||||
lj4 = lj34[3].lj4;
|
lj4 = lj34[3].lj4;
|
||||||
offset = ljc12o[3].offset;
|
offset = ljc12o[3].offset;
|
||||||
}
|
}
|
||||||
for (int i = iifrom; i < iito; i += iip) {
|
for (int i = iifrom; i < iito; i += iip) {
|
||||||
int itype, ptr_off;
|
int itype, ptr_off;
|
||||||
const FC_PACKED1_T * _noalias ljc12oi;
|
const FC_PACKED1_T * _noalias ljc12oi;
|
||||||
const FC_PACKED2_T * _noalias lj34i;
|
const FC_PACKED2_T * _noalias lj34i;
|
||||||
if (!ONETYPE) {
|
if (!ONETYPE) {
|
||||||
itype = x[i].w;
|
itype = x[i].w;
|
||||||
ptr_off = itype * ntypes;
|
ptr_off = itype * ntypes;
|
||||||
ljc12oi = ljc12o + ptr_off;
|
ljc12oi = ljc12o + ptr_off;
|
||||||
lj34i = lj34 + ptr_off;
|
lj34i = lj34 + ptr_off;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
||||||
const int jnum = numneigh[i];
|
const int jnum = numneigh[i];
|
||||||
@ -228,113 +228,113 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t ytmp = x[i].y;
|
const flt_t ytmp = x[i].y;
|
||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
flt_t forcelj, evdwl;
|
flt_t forcelj, evdwl;
|
||||||
forcelj = evdwl = (flt_t)0.0;
|
forcelj = evdwl = (flt_t)0.0;
|
||||||
|
|
||||||
int j, jtype, sbindex;
|
int j, jtype, sbindex;
|
||||||
if (!ONETYPE) {
|
if (!ONETYPE) {
|
||||||
sbindex = jlist[jj] >> SBBITS & 3;
|
sbindex = jlist[jj] >> SBBITS & 3;
|
||||||
j = jlist[jj] & NEIGHMASK;
|
j = jlist[jj] & NEIGHMASK;
|
||||||
} else
|
} else
|
||||||
j = jlist[jj];
|
j = jlist[jj];
|
||||||
|
|
||||||
const flt_t delx = xtmp - x[j].x;
|
const flt_t delx = xtmp - x[j].x;
|
||||||
const flt_t dely = ytmp - x[j].y;
|
const flt_t dely = ytmp - x[j].y;
|
||||||
const flt_t delz = ztmp - x[j].z;
|
const flt_t delz = ztmp - x[j].z;
|
||||||
if (!ONETYPE) {
|
if (!ONETYPE) {
|
||||||
jtype = x[j].w;
|
jtype = x[j].w;
|
||||||
cutsq = ljc12oi[jtype].cutsq;
|
cutsq = ljc12oi[jtype].cutsq;
|
||||||
}
|
}
|
||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
if (rsq < cutsq) {
|
if (rsq < cutsq) {
|
||||||
#endif
|
#endif
|
||||||
flt_t factor_lj;
|
flt_t factor_lj;
|
||||||
if (!ONETYPE) factor_lj = special_lj[sbindex];
|
if (!ONETYPE) factor_lj = special_lj[sbindex];
|
||||||
flt_t r2inv = 1.0 / rsq;
|
flt_t r2inv = 1.0 / rsq;
|
||||||
flt_t r6inv = r2inv * r2inv * r2inv;
|
flt_t r6inv = r2inv * r2inv * r2inv;
|
||||||
#ifndef INTEL_VMASK
|
#ifndef INTEL_VMASK
|
||||||
if (rsq > cutsq) r6inv = (flt_t)0.0;
|
if (rsq > cutsq) r6inv = (flt_t)0.0;
|
||||||
#endif
|
#endif
|
||||||
if (!ONETYPE) {
|
if (!ONETYPE) {
|
||||||
lj1 = ljc12oi[jtype].lj1;
|
lj1 = ljc12oi[jtype].lj1;
|
||||||
lj2 = ljc12oi[jtype].lj2;
|
lj2 = ljc12oi[jtype].lj2;
|
||||||
}
|
}
|
||||||
forcelj = r6inv * (lj1 * r6inv - lj2);
|
forcelj = r6inv * (lj1 * r6inv - lj2);
|
||||||
flt_t fpair;
|
flt_t fpair;
|
||||||
if (!ONETYPE)
|
if (!ONETYPE)
|
||||||
fpair = factor_lj * forcelj * r2inv;
|
fpair = factor_lj * forcelj * r2inv;
|
||||||
else
|
else
|
||||||
fpair = forcelj * r2inv;
|
fpair = forcelj * r2inv;
|
||||||
|
|
||||||
const flt_t fpx = fpair * delx;
|
const flt_t fpx = fpair * delx;
|
||||||
fxtmp += fpx;
|
fxtmp += fpx;
|
||||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
const flt_t fpy = fpair * dely;
|
const flt_t fpy = fpair * dely;
|
||||||
fytmp += fpy;
|
fytmp += fpy;
|
||||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||||
const flt_t fpz = fpair * delz;
|
const flt_t fpz = fpair * delz;
|
||||||
fztmp += fpz;
|
fztmp += fpz;
|
||||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
if (!ONETYPE) {
|
if (!ONETYPE) {
|
||||||
lj3 = lj34i[jtype].lj3;
|
lj3 = lj34i[jtype].lj3;
|
||||||
lj4 = lj34i[jtype].lj4;
|
lj4 = lj34i[jtype].lj4;
|
||||||
offset = ljc12oi[jtype].offset;
|
offset = ljc12oi[jtype].offset;
|
||||||
}
|
}
|
||||||
evdwl = r6inv * (lj3 * r6inv - lj4);
|
evdwl = r6inv * (lj3 * r6inv - lj4);
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
evdwl -= offset;
|
evdwl -= offset;
|
||||||
#else
|
#else
|
||||||
if (rsq < cutsq) evdwl -= offset;
|
if (rsq < cutsq) evdwl -= offset;
|
||||||
#endif
|
#endif
|
||||||
if (!ONETYPE) evdwl *= factor_lj;
|
if (!ONETYPE) evdwl *= factor_lj;
|
||||||
sevdwl += evdwl;
|
sevdwl += evdwl;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
fwtmp += (flt_t)0.5 * evdwl;
|
fwtmp += (flt_t)0.5 * evdwl;
|
||||||
if (NEWTON_PAIR)
|
if (NEWTON_PAIR)
|
||||||
f[j].w += (flt_t)0.5 * evdwl;
|
f[j].w += (flt_t)0.5 * evdwl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_PAIR == 0)
|
if (NEWTON_PAIR == 0)
|
||||||
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
} // if rsq
|
} // if rsq
|
||||||
#endif
|
#endif
|
||||||
} // for jj
|
} // for jj
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
f[i].x += fxtmp;
|
f[i].x += fxtmp;
|
||||||
f[i].y += fytmp;
|
f[i].y += fytmp;
|
||||||
f[i].z += fztmp;
|
f[i].z += fztmp;
|
||||||
} else {
|
} else {
|
||||||
f[i].x = fxtmp;
|
f[i].x = fxtmp;
|
||||||
f[i].y = fytmp;
|
f[i].y = fytmp;
|
||||||
f[i].z = fztmp;
|
f[i].z = fztmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
ov4, ov5);
|
ov4, ov5);
|
||||||
} // end omp
|
} // end omp
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||||
@ -343,12 +343,12 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
}
|
}
|
||||||
if (vflag) {
|
if (vflag) {
|
||||||
if (NEWTON_PAIR == 0) {
|
if (NEWTON_PAIR == 0) {
|
||||||
ov0 *= (acc_t)0.5;
|
ov0 *= (acc_t)0.5;
|
||||||
ov1 *= (acc_t)0.5;
|
ov1 *= (acc_t)0.5;
|
||||||
ov2 *= (acc_t)0.5;
|
ov2 *= (acc_t)0.5;
|
||||||
ov3 *= (acc_t)0.5;
|
ov3 *= (acc_t)0.5;
|
||||||
ov4 *= (acc_t)0.5;
|
ov4 *= (acc_t)0.5;
|
||||||
ov5 *= (acc_t)0.5;
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
ev_global[2] = ov0;
|
ev_global[2] = ov0;
|
||||||
ev_global[3] = ov1;
|
ev_global[3] = ov1;
|
||||||
@ -454,7 +454,7 @@ void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void PairLJCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
void PairLJCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
||||||
Memory *memory,
|
Memory *memory,
|
||||||
const int cop) {
|
const int cop) {
|
||||||
if (ntypes != _ntypes) {
|
if (ntypes != _ntypes) {
|
||||||
if (_ntypes > 0) {
|
if (_ntypes > 0) {
|
||||||
fc_packed1 *oljc12o = ljc12o[0];
|
fc_packed1 *oljc12o = ljc12o[0];
|
||||||
|
|||||||
@ -1,50 +1,50 @@
|
|||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||||
http://lammps.sandia.gov, Sandia National Laboratories
|
http://lammps.sandia.gov, Sandia National Laboratories
|
||||||
Steve Plimpton, sjplimp@sandia.gov
|
Steve Plimpton, sjplimp@sandia.gov
|
||||||
|
|
||||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||||
certain rights in this software. This software is distributed under
|
certain rights in this software. This software is distributed under
|
||||||
the GNU General Public License.
|
the GNU General Public License.
|
||||||
|
|
||||||
See the README file in the top-level LAMMPS directory.
|
See the README file in the top-level LAMMPS directory.
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
Contributing authors: William McDoniel (RWTH Aachen University)
|
Contributing authors: William McDoniel (RWTH Aachen University)
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include "pair_lj_long_coul_long_intel.h"
|
#include "pair_lj_long_coul_long_intel.h"
|
||||||
#include "atom.h"
|
#include "atom.h"
|
||||||
#include "comm.h"
|
#include "comm.h"
|
||||||
#include "force.h"
|
#include "force.h"
|
||||||
#include "group.h"
|
#include "group.h"
|
||||||
#include "kspace.h"
|
#include "kspace.h"
|
||||||
#include "memory.h"
|
#include "memory.h"
|
||||||
#include "neighbor.h"
|
#include "neighbor.h"
|
||||||
#include "neigh_list.h"
|
#include "neigh_list.h"
|
||||||
#include "neigh_request.h"
|
#include "neigh_request.h"
|
||||||
#include "memory.h"
|
#include "memory.h"
|
||||||
#include "suffix.h"
|
#include "suffix.h"
|
||||||
|
|
||||||
|
|
||||||
using namespace LAMMPS_NS;
|
using namespace LAMMPS_NS;
|
||||||
|
|
||||||
#define C_FORCE_T typename ForceConst<flt_t>::c_force_t
|
#define C_FORCE_T typename ForceConst<flt_t>::c_force_t
|
||||||
#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
|
#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
|
||||||
#define TABLE_T typename ForceConst<flt_t>::table_t
|
#define TABLE_T typename ForceConst<flt_t>::table_t
|
||||||
|
|
||||||
PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) :
|
PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) :
|
||||||
PairLJLongCoulLong(lmp)
|
PairLJLongCoulLong(lmp)
|
||||||
{
|
{
|
||||||
suffix_flag |= Suffix::INTEL;
|
suffix_flag |= Suffix::INTEL;
|
||||||
respa_enable = 0;
|
respa_enable = 0;
|
||||||
cut_respa = NULL;
|
cut_respa = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel()
|
PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel()
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,39 +1,39 @@
|
|||||||
/* *- c++ -*- -----------------------------------------------------------
|
/* *- c++ -*- -----------------------------------------------------------
|
||||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||||
http://lammps.sandia.gov, Sandia National Laboratories
|
http://lammps.sandia.gov, Sandia National Laboratories
|
||||||
Steve Plimpton, sjplimp@sandia.gov
|
Steve Plimpton, sjplimp@sandia.gov
|
||||||
|
|
||||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||||
certain rights in this software. This software is distributed under
|
certain rights in this software. This software is distributed under
|
||||||
the GNU General Public License.
|
the GNU General Public License.
|
||||||
|
|
||||||
See the README file in the top-level LAMMPS directory.
|
See the README file in the top-level LAMMPS directory.
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
Contributing authors: William McDoniel (RWTH Aachen University)
|
Contributing authors: William McDoniel (RWTH Aachen University)
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#ifdef PAIR_CLASS
|
#ifdef PAIR_CLASS
|
||||||
|
|
||||||
PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel)
|
PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
|
#ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
|
||||||
#define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
|
#define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
|
||||||
|
|
||||||
#include "pair_lj_long_coul_long.h"
|
#include "pair_lj_long_coul_long.h"
|
||||||
#include "fix_intel.h"
|
#include "fix_intel.h"
|
||||||
|
|
||||||
namespace LAMMPS_NS {
|
namespace LAMMPS_NS {
|
||||||
class PairLJLongCoulLongIntel : public PairLJLongCoulLong {
|
class PairLJLongCoulLongIntel : public PairLJLongCoulLong {
|
||||||
public:
|
public:
|
||||||
PairLJLongCoulLongIntel(class LAMMPS *);
|
PairLJLongCoulLongIntel(class LAMMPS *);
|
||||||
virtual ~PairLJLongCoulLongIntel();
|
virtual ~PairLJLongCoulLongIntel();
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -49,7 +49,7 @@ class PairSWIntel : public PairSW {
|
|||||||
template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t>
|
template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t>
|
||||||
void eval(const int offload, const int vflag,
|
void eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
|
IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
|
||||||
const int astart, const int aend, const int pad_width);
|
const int astart, const int aend, const int pad_width);
|
||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
|
|||||||
@ -47,7 +47,7 @@ void PairTersoffIntel::init_style()
|
|||||||
{
|
{
|
||||||
if (comm->me == 0) {
|
if (comm->me == 0) {
|
||||||
error->warning(FLERR, "Tersoff/intel currently requires intel compiler. "
|
error->warning(FLERR, "Tersoff/intel currently requires intel compiler. "
|
||||||
"Using MANYBODY version.");
|
"Using MANYBODY version.");
|
||||||
}
|
}
|
||||||
PairTersoff::init_style();
|
PairTersoff::init_style();
|
||||||
}
|
}
|
||||||
@ -87,7 +87,7 @@ PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
|
|||||||
void PairTersoffIntel::compute(int eflag, int vflag)
|
void PairTersoffIntel::compute(int eflag, int vflag)
|
||||||
{
|
{
|
||||||
if (fix->precision()==FixIntel::PREC_MODE_MIXED) {
|
if (fix->precision()==FixIntel::PREC_MODE_MIXED) {
|
||||||
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
||||||
force_const_single);
|
force_const_single);
|
||||||
} else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) {
|
} else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) {
|
||||||
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
||||||
@ -104,8 +104,8 @@ void PairTersoffIntel::compute(int eflag, int vflag)
|
|||||||
// do we need to calculate energy/virial
|
// do we need to calculate energy/virial
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void PairTersoffIntel::compute(int eflag, int vflag,
|
void PairTersoffIntel::compute(int eflag, int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
if (eflag || vflag) {
|
if (eflag || vflag) {
|
||||||
ev_setup(eflag,vflag);
|
ev_setup(eflag,vflag);
|
||||||
@ -127,13 +127,13 @@ void PairTersoffIntel::compute(int eflag, int vflag,
|
|||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||||
packthreads, sizeof(ATOM_T));
|
packthreads, sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom,ito,ago);
|
buffers->thr_pack(ifrom,ito,ago);
|
||||||
}
|
}
|
||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ovflag = 0;
|
int ovflag = 0;
|
||||||
if (vflag_fdotr) ovflag = 2;
|
if (vflag_fdotr) ovflag = 2;
|
||||||
else if (vflag) ovflag = 1;
|
else if (vflag) ovflag = 1;
|
||||||
@ -170,14 +170,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
|
|||||||
// what's done in here is that they are inlined and vectorized
|
// what's done in here is that they are inlined and vectorized
|
||||||
// attractive() also provides an option to compute zeta as well
|
// attractive() also provides an option to compute zeta as well
|
||||||
static fvec zeta_vector(
|
static fvec zeta_vector(
|
||||||
const c_inner_t * param,
|
const c_inner_t * param,
|
||||||
ivec xjw, bvec mask,
|
ivec xjw, bvec mask,
|
||||||
fvec vrij, fvec rsq2,
|
fvec vrij, fvec rsq2,
|
||||||
fvec vdijx, fvec vdijy, fvec vdijz,
|
fvec vdijx, fvec vdijy, fvec vdijz,
|
||||||
fvec dikx, fvec diky, fvec dikz
|
fvec dikx, fvec diky, fvec dikz
|
||||||
);
|
);
|
||||||
static void force_zeta_vector(
|
static void force_zeta_vector(
|
||||||
const c_outer_t * param,
|
const c_outer_t * param,
|
||||||
ivec xjw,
|
ivec xjw,
|
||||||
bvec mask,
|
bvec mask,
|
||||||
fvec vrijsq, fvec vzeta_ij,
|
fvec vrijsq, fvec vzeta_ij,
|
||||||
@ -202,14 +202,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
|
|||||||
// perform the actual computation
|
// perform the actual computation
|
||||||
template<bool EFLAG>
|
template<bool EFLAG>
|
||||||
static void kernel(
|
static void kernel(
|
||||||
int iito, int iifrom, int eatom, int vflag,
|
int iito, int iifrom, int eatom, int vflag,
|
||||||
const int * _noalias const numneigh,
|
const int * _noalias const numneigh,
|
||||||
const int * _noalias const numneighhalf,
|
const int * _noalias const numneighhalf,
|
||||||
const int * _noalias const cnumneigh,
|
const int * _noalias const cnumneigh,
|
||||||
const int * _noalias const firstneigh, int ntypes,
|
const int * _noalias const firstneigh, int ntypes,
|
||||||
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
|
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
|
||||||
const c_inner_t * _noalias const c_inner,
|
const c_inner_t * _noalias const c_inner,
|
||||||
const c_outer_t * _noalias const c_outer,
|
const c_outer_t * _noalias const c_outer,
|
||||||
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
||||||
acc_t *evdwl
|
acc_t *evdwl
|
||||||
);
|
);
|
||||||
@ -217,14 +217,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
|
|||||||
// perform one step of calculation, pass in i-j pairs of atoms (is, js)
|
// perform one step of calculation, pass in i-j pairs of atoms (is, js)
|
||||||
template<int EFLAG>
|
template<int EFLAG>
|
||||||
static void kernel_step(
|
static void kernel_step(
|
||||||
int eatom, int vflag,
|
int eatom, int vflag,
|
||||||
const int * _noalias const numneigh,
|
const int * _noalias const numneigh,
|
||||||
const int * _noalias const cnumneigh,
|
const int * _noalias const cnumneigh,
|
||||||
const int * _noalias const firstneigh,
|
const int * _noalias const firstneigh,
|
||||||
int ntypes,
|
int ntypes,
|
||||||
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
|
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
|
||||||
const c_inner_t * _noalias const c_inner,
|
const c_inner_t * _noalias const c_inner,
|
||||||
const c_outer_t * _noalias const c_outer,
|
const c_outer_t * _noalias const c_outer,
|
||||||
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
||||||
avec *vsevdwl, int compress_idx, iarr is, iarr js, bvec vmask_repulsive
|
avec *vsevdwl, int compress_idx, iarr is, iarr js, bvec vmask_repulsive
|
||||||
);
|
);
|
||||||
@ -233,12 +233,12 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
|
|||||||
// with fixed i and a number of js
|
// with fixed i and a number of js
|
||||||
template<int EFLAG>
|
template<int EFLAG>
|
||||||
static void kernel_step_const_i(
|
static void kernel_step_const_i(
|
||||||
int eatom, int vflag,
|
int eatom, int vflag,
|
||||||
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
|
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
|
||||||
const int * _noalias const firstneigh, int ntypes,
|
const int * _noalias const firstneigh, int ntypes,
|
||||||
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
|
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
|
||||||
const c_inner_t * _noalias const c_inner,
|
const c_inner_t * _noalias const c_inner,
|
||||||
const c_outer_t * _noalias const c_outer,
|
const c_outer_t * _noalias const c_outer,
|
||||||
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
||||||
avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive
|
avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive
|
||||||
);
|
);
|
||||||
@ -255,9 +255,9 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
|
|||||||
// This method is nearly identical to what happens in the other /intel styles
|
// This method is nearly identical to what happens in the other /intel styles
|
||||||
template <int EFLAG, class flt_t, class acc_t>
|
template <int EFLAG, class flt_t, class acc_t>
|
||||||
void PairTersoffIntel::eval(const int offload, const int vflag,
|
void PairTersoffIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
const int astart, const int aend)
|
const int astart, const int aend)
|
||||||
{
|
{
|
||||||
const int inum = aend - astart;
|
const int inum = aend - astart;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@ -289,8 +289,8 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
|
|||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, 1, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, 1, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
int tc;
|
int tc;
|
||||||
FORCE_T * _noalias f_start;
|
FORCE_T * _noalias f_start;
|
||||||
@ -326,8 +326,8 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
|
IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
|
||||||
f_stride, x, 0);
|
f_stride, x, 0);
|
||||||
|
|
||||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||||
@ -354,7 +354,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
|
|||||||
// Pick the variable i algorithm under specific conditions
|
// Pick the variable i algorithm under specific conditions
|
||||||
// do use scalar algorithm with very short vectors
|
// do use scalar algorithm with very short vectors
|
||||||
int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL;
|
int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL;
|
||||||
bool pack_i = VL >= 8 &&
|
bool pack_i = VL >= 8 &&
|
||||||
lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
|
lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
|
||||||
bool use_scalar = VL < 4;
|
bool use_scalar = VL < 4;
|
||||||
if (use_scalar) {
|
if (use_scalar) {
|
||||||
@ -364,16 +364,16 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
|
|||||||
} else {
|
} else {
|
||||||
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EFLAG>(ARGS);
|
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EFLAG>(ARGS);
|
||||||
}
|
}
|
||||||
if (EFLAG) oevdwl += sevdwl;
|
if (EFLAG) oevdwl += sevdwl;
|
||||||
}
|
}
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start,
|
IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start,
|
||||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
ov4, ov5);
|
ov4, ov5);
|
||||||
} // end of omp parallel region
|
} // end of omp parallel region
|
||||||
|
|
||||||
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
|
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
|
||||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
ev_global[0] = oevdwl;
|
ev_global[0] = oevdwl;
|
||||||
@ -431,7 +431,7 @@ void PairTersoffIntel::init_style()
|
|||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"The 'package intel' command is required for /intel styles");
|
"The 'package intel' command is required for /intel styles");
|
||||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||||
|
|
||||||
fix->pair_init_check();
|
fix->pair_init_check();
|
||||||
fix->three_body_neighbor(1);
|
fix->three_body_neighbor(1);
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
@ -481,25 +481,25 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
for (int k = 1; k < tp1; k++) {
|
for (int k = 1; k < tp1; k++) {
|
||||||
Param * param = ¶ms[elem2param[map[i]][map[j]][map[k]]];
|
Param * param = ¶ms[elem2param[map[i]][map[j]][map[k]]];
|
||||||
fc.c_cutoff_inner[i][k][j].cutsq = static_cast<flt_t>(param->cutsq);
|
fc.c_cutoff_inner[i][k][j].cutsq = static_cast<flt_t>(param->cutsq);
|
||||||
fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
|
fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
|
||||||
fc.c_inner_loop[i][j][k].bigr = static_cast<flt_t>(param->bigr);
|
fc.c_inner_loop[i][j][k].bigr = static_cast<flt_t>(param->bigr);
|
||||||
fc.c_inner_loop[i][j][k].bigd = static_cast<flt_t>(param->bigd);
|
fc.c_inner_loop[i][j][k].bigd = static_cast<flt_t>(param->bigd);
|
||||||
fc.c_inner_loop[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
|
fc.c_inner_loop[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
|
||||||
fc.c_inner_loop[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
|
fc.c_inner_loop[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
|
||||||
fc.c_inner_loop[i][j][k].h = static_cast<flt_t>(param->h);
|
fc.c_inner_loop[i][j][k].h = static_cast<flt_t>(param->h);
|
||||||
fc.c_inner_loop[i][j][k].gamma = static_cast<flt_t>(param->gamma);
|
fc.c_inner_loop[i][j][k].gamma = static_cast<flt_t>(param->gamma);
|
||||||
fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint);
|
fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint);
|
||||||
|
|
||||||
fc.c_inner[i][j][k].cutsq = static_cast<flt_t>(param->cutsq);
|
fc.c_inner[i][j][k].cutsq = static_cast<flt_t>(param->cutsq);
|
||||||
fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
|
fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
|
||||||
fc.c_inner[i][j][k].bigr = static_cast<flt_t>(param->bigr);
|
fc.c_inner[i][j][k].bigr = static_cast<flt_t>(param->bigr);
|
||||||
fc.c_inner[i][j][k].bigd = static_cast<flt_t>(param->bigd);
|
fc.c_inner[i][j][k].bigd = static_cast<flt_t>(param->bigd);
|
||||||
fc.c_inner[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
|
fc.c_inner[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
|
||||||
fc.c_inner[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
|
fc.c_inner[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
|
||||||
fc.c_inner[i][j][k].h = static_cast<flt_t>(param->h);
|
fc.c_inner[i][j][k].h = static_cast<flt_t>(param->h);
|
||||||
fc.c_inner[i][j][k].gamma = static_cast<flt_t>(param->gamma);
|
fc.c_inner[i][j][k].gamma = static_cast<flt_t>(param->gamma);
|
||||||
fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint);
|
fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint);
|
||||||
|
|
||||||
}
|
}
|
||||||
Param * param = ¶ms[elem2param[map[i]][map[j]][map[j]]];
|
Param * param = ¶ms[elem2param[map[i]][map[j]][map[j]]];
|
||||||
fc.c_cutoff_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
|
fc.c_cutoff_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
|
||||||
@ -515,7 +515,7 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
fc.c_second_loop[i][j].c2 = static_cast<flt_t>(param->c2);
|
fc.c_second_loop[i][j].c2 = static_cast<flt_t>(param->c2);
|
||||||
fc.c_second_loop[i][j].c3 = static_cast<flt_t>(param->c3);
|
fc.c_second_loop[i][j].c3 = static_cast<flt_t>(param->c3);
|
||||||
fc.c_second_loop[i][j].c4 = static_cast<flt_t>(param->c4);
|
fc.c_second_loop[i][j].c4 = static_cast<flt_t>(param->c4);
|
||||||
|
|
||||||
fc.c_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
|
fc.c_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
|
||||||
fc.c_outer[i][j].bigr = static_cast<flt_t>(param->bigr);
|
fc.c_outer[i][j].bigr = static_cast<flt_t>(param->bigr);
|
||||||
fc.c_outer[i][j].bigd = static_cast<flt_t>(param->bigd);
|
fc.c_outer[i][j].bigd = static_cast<flt_t>(param->bigd);
|
||||||
@ -563,8 +563,8 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
// As in any other /intel pair style
|
// As in any other /intel pair style
|
||||||
template <class flt_t>
|
template <class flt_t>
|
||||||
void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
||||||
Memory *memory,
|
Memory *memory,
|
||||||
const int cop) {
|
const int cop) {
|
||||||
if ( (ntypes != _ntypes) ) {
|
if ( (ntypes != _ntypes) ) {
|
||||||
if (_ntypes > 0) {
|
if (_ntypes > 0) {
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
@ -575,12 +575,12 @@ void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0];
|
c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0];
|
||||||
c_inner_t * oc_inner = c_inner[0][0];
|
c_inner_t * oc_inner = c_inner[0][0];
|
||||||
c_outer_t * oc_outer = c_outer[0];
|
c_outer_t * oc_outer = c_outer[0];
|
||||||
if (c_first_loop != NULL && c_second_loop != NULL &&
|
if (c_first_loop != NULL && c_second_loop != NULL &&
|
||||||
c_inner_loop != NULL && _cop >= 0) {
|
c_inner_loop != NULL && _cop >= 0) {
|
||||||
|
|
||||||
#pragma offload_transfer target(mic:cop) \
|
#pragma offload_transfer target(mic:cop) \
|
||||||
nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \
|
nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \
|
||||||
nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \
|
nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \
|
||||||
nocopy(oc_inner, oc_outer: alloc_if(0) free_if(0))
|
nocopy(oc_inner, oc_outer: alloc_if(0) free_if(0))
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -614,7 +614,7 @@ void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
|||||||
int tp1sq = ntypes * ntypes;
|
int tp1sq = ntypes * ntypes;
|
||||||
int tp1cb = ntypes * ntypes * ntypes;
|
int tp1cb = ntypes * ntypes * ntypes;
|
||||||
int tp1cb_pad = ntypes * ntypes * ntypes_pad;
|
int tp1cb_pad = ntypes * ntypes * ntypes_pad;
|
||||||
if (oc_first_loop != NULL && oc_second_loop != NULL &&
|
if (oc_first_loop != NULL && oc_second_loop != NULL &&
|
||||||
oc_inner_loop != NULL && cop >= 0) {
|
oc_inner_loop != NULL && cop >= 0) {
|
||||||
#pragma offload_transfer target(mic:cop) \
|
#pragma offload_transfer target(mic:cop) \
|
||||||
nocopy(oc_first_loop: length(tp1sq) alloc_if(1) free_if(0)) \
|
nocopy(oc_first_loop: length(tp1sq) alloc_if(1) free_if(0)) \
|
||||||
@ -642,15 +642,15 @@ static const int N_CACHE = 8;
|
|||||||
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
||||||
template<int EFLAG>
|
template<int EFLAG>
|
||||||
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
||||||
int eatom, int vflag,
|
int eatom, int vflag,
|
||||||
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
|
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
|
||||||
const int * _noalias const firstneigh, int ntypes,
|
const int * _noalias const firstneigh, int ntypes,
|
||||||
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
|
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
|
||||||
const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
|
const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
|
||||||
const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
|
const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
|
||||||
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
||||||
avec *vsevdwl,
|
avec *vsevdwl,
|
||||||
int compress_idx,
|
int compress_idx,
|
||||||
iarr is,
|
iarr is,
|
||||||
iarr js,
|
iarr js,
|
||||||
bvec vmask_repulsive
|
bvec vmask_repulsive
|
||||||
@ -662,7 +662,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
|||||||
ivec v_i0(0);
|
ivec v_i0(0);
|
||||||
ivec v_i_ntypes(ntypes);
|
ivec v_i_ntypes(ntypes);
|
||||||
ivec v_i_NEIGHMASK(NEIGHMASK);
|
ivec v_i_NEIGHMASK(NEIGHMASK);
|
||||||
|
|
||||||
farr fx, fy, fz, fw;
|
farr fx, fy, fz, fw;
|
||||||
int cache_idx = 0;
|
int cache_idx = 0;
|
||||||
fvec vfkx_cache[N_CACHE];
|
fvec vfkx_cache[N_CACHE];
|
||||||
@ -672,7 +672,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
|||||||
bvec vmask_cache[N_CACHE];
|
bvec vmask_cache[N_CACHE];
|
||||||
ivec vkks_final_cache;
|
ivec vkks_final_cache;
|
||||||
bvec vmask_final_cache;
|
bvec vmask_final_cache;
|
||||||
iarr ts;
|
iarr ts;
|
||||||
// compute all the stuff we know from i and j
|
// compute all the stuff we know from i and j
|
||||||
// TDO: We could extract this from the driver routine
|
// TDO: We could extract this from the driver routine
|
||||||
ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is));
|
ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is));
|
||||||
@ -738,7 +738,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
|||||||
&vfix,&vfiy,&vfiz,
|
&vfix,&vfiy,&vfiz,
|
||||||
&vfjx,&vfjy,&vfjz,
|
&vfjx,&vfjy,&vfjz,
|
||||||
&vfkx,&vfky,&vfkz,
|
&vfkx,&vfky,&vfkz,
|
||||||
&vzeta_contrib);
|
&vzeta_contrib);
|
||||||
vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
|
vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
|
||||||
vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
|
vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
|
||||||
vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
|
vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
|
||||||
@ -749,9 +749,9 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
|||||||
vfkx_cache[cache_idx] = vfkx;
|
vfkx_cache[cache_idx] = vfkx;
|
||||||
vfky_cache[cache_idx] = vfky;
|
vfky_cache[cache_idx] = vfky;
|
||||||
vfkz_cache[cache_idx] = vfkz;
|
vfkz_cache[cache_idx] = vfkz;
|
||||||
vks_cache[cache_idx] = vks;
|
vks_cache[cache_idx] = vks;
|
||||||
vmask_cache[cache_idx] = veff_mask;
|
vmask_cache[cache_idx] = veff_mask;
|
||||||
cache_idx += 1;
|
cache_idx += 1;
|
||||||
|
|
||||||
vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
|
vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
|
||||||
vkks = vkks + v_i1;
|
vkks = vkks + v_i1;
|
||||||
@ -799,7 +799,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
|||||||
vfjxtmp = vfjxtmp * vprefactor - vdx_ij * vfpair;
|
vfjxtmp = vfjxtmp * vprefactor - vdx_ij * vfpair;
|
||||||
vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair;
|
vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair;
|
||||||
vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair;
|
vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair;
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
|
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
@ -833,7 +833,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
|||||||
fvec vx_k, vy_k, vz_k, vcutsq;
|
fvec vx_k, vy_k, vz_k, vcutsq;
|
||||||
while (! v::mask_testz(vactive_mask)) {
|
while (! v::mask_testz(vactive_mask)) {
|
||||||
bvec vnew_mask = vactive_mask & ~ veff_old_mask;
|
bvec vnew_mask = vactive_mask & ~ veff_old_mask;
|
||||||
vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
|
vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
|
||||||
v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
|
v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
|
||||||
v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
|
v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
|
||||||
fvec vdx_ik = vx_k - vx_i;
|
fvec vdx_ik = vx_k - vx_i;
|
||||||
@ -855,7 +855,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
|||||||
&vfix,&vfiy,&vfiz,
|
&vfix,&vfiy,&vfiz,
|
||||||
&vfjx,&vfjy,&vfjz,
|
&vfjx,&vfjy,&vfjz,
|
||||||
&vfkx,&vfky,&vfkz,
|
&vfkx,&vfky,&vfkz,
|
||||||
0);
|
0);
|
||||||
vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
|
vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
|
||||||
vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
|
vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
|
||||||
vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
|
vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
|
||||||
@ -917,15 +917,15 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
|||||||
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
||||||
template<int EFLAG>
|
template<int EFLAG>
|
||||||
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
||||||
int eatom, int vflag,
|
int eatom, int vflag,
|
||||||
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
|
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
|
||||||
const int * _noalias const firstneigh, int ntypes,
|
const int * _noalias const firstneigh, int ntypes,
|
||||||
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
|
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
|
||||||
const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
|
const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
|
||||||
const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
|
const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
|
||||||
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
||||||
avec *vsevdwl,
|
avec *vsevdwl,
|
||||||
int compress_idx,
|
int compress_idx,
|
||||||
int i,
|
int i,
|
||||||
iarr js,
|
iarr js,
|
||||||
bvec vmask_repulsive
|
bvec vmask_repulsive
|
||||||
@ -951,7 +951,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
|||||||
int kk_final_cache;
|
int kk_final_cache;
|
||||||
|
|
||||||
aarr fx, fy, fz, fw;
|
aarr fx, fy, fz, fw;
|
||||||
iarr ts;
|
iarr ts;
|
||||||
|
|
||||||
bvec vmask = v::mask_enable_lower(compress_idx);
|
bvec vmask = v::mask_enable_lower(compress_idx);
|
||||||
fvec vx_i(x[i].x), vy_i(x[i].y), vz_i(x[i].z);
|
fvec vx_i(x[i].x), vy_i(x[i].y), vz_i(x[i].z);
|
||||||
@ -997,7 +997,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
|||||||
fvec vfix, vfiy, vfiz;
|
fvec vfix, vfiy, vfiz;
|
||||||
fvec vfjx, vfjy, vfjz;
|
fvec vfjx, vfjy, vfjz;
|
||||||
fvec vfkx, vfky, vfkz;
|
fvec vfkx, vfky, vfkz;
|
||||||
|
|
||||||
attractive_vector<true>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,fvec(1.),
|
attractive_vector<true>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,fvec(1.),
|
||||||
vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
|
vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
|
||||||
&vfix,&vfiy,&vfiz,
|
&vfix,&vfiy,&vfiz,
|
||||||
@ -1010,7 +1010,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
|||||||
vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
|
vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
|
||||||
vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
|
vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
|
||||||
vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
|
vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
|
||||||
|
|
||||||
vfkx_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkx, v::zero());
|
vfkx_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkx, v::zero());
|
||||||
vfky_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfky, v::zero());
|
vfky_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfky, v::zero());
|
||||||
vfkz_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkz, v::zero());
|
vfkz_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkz, v::zero());
|
||||||
@ -1037,7 +1037,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
|||||||
bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
|
bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
|
||||||
bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
|
bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
|
||||||
if (! v::mask_testz(veff_mask)) {
|
if (! v::mask_testz(veff_mask)) {
|
||||||
fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq,
|
fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq,
|
||||||
vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik);
|
vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik);
|
||||||
vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
|
vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
|
||||||
}
|
}
|
||||||
@ -1051,7 +1051,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
|||||||
vfjxtmp = vfjxtmp * vaprefactor - avec(vdx_ij * vfpair);
|
vfjxtmp = vfjxtmp * vaprefactor - avec(vdx_ij * vfpair);
|
||||||
vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair);
|
vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair);
|
||||||
vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair);
|
vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair);
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
|
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
@ -1093,7 +1093,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
|||||||
&vfix,&vfiy,&vfiz,
|
&vfix,&vfiy,&vfiz,
|
||||||
&vfjx,&vfjy,&vfjz,
|
&vfjx,&vfjy,&vfjz,
|
||||||
&vfkx,&vfky,&vfkz,
|
&vfkx,&vfky,&vfkz,
|
||||||
0);
|
0);
|
||||||
vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
|
vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
|
||||||
vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
|
vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
|
||||||
vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
|
vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
|
||||||
@ -1129,14 +1129,14 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
|||||||
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
||||||
template<bool EFLAG>
|
template<bool EFLAG>
|
||||||
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
|
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
|
||||||
int iito, int iifrom, int eatom, int vflag,
|
int iito, int iifrom, int eatom, int vflag,
|
||||||
const int * _noalias const numneigh,
|
const int * _noalias const numneigh,
|
||||||
const int * _noalias const numneighhalf,
|
const int * _noalias const numneighhalf,
|
||||||
const int * _noalias const cnumneigh,
|
const int * _noalias const cnumneigh,
|
||||||
const int * _noalias const firstneigh, int ntypes,
|
const int * _noalias const firstneigh, int ntypes,
|
||||||
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
|
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
|
||||||
const c_inner_t * _noalias const c_inner,
|
const c_inner_t * _noalias const c_inner,
|
||||||
const c_outer_t * _noalias const c_outer,
|
const c_outer_t * _noalias const c_outer,
|
||||||
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
||||||
acc_t *evdwl
|
acc_t *evdwl
|
||||||
) {
|
) {
|
||||||
@ -1181,10 +1181,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
|
|||||||
if (compress_idx == v::VL) {
|
if (compress_idx == v::VL) {
|
||||||
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
|
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
|
||||||
kernel_step<EFLAG>(
|
kernel_step<EFLAG>(
|
||||||
eatom, vflag,
|
eatom, vflag,
|
||||||
numneigh, cnumneigh, firstneigh, ntypes,
|
numneigh, cnumneigh, firstneigh, ntypes,
|
||||||
x, c_inner, c_outer, f,
|
x, c_inner, c_outer, f,
|
||||||
&vsevdwl, compress_idx,
|
&vsevdwl, compress_idx,
|
||||||
is, js, vmask_repulsive
|
is, js, vmask_repulsive
|
||||||
);
|
);
|
||||||
compress_idx = 0;
|
compress_idx = 0;
|
||||||
@ -1194,10 +1194,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
|
|||||||
if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) {
|
if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) {
|
||||||
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
|
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
|
||||||
kernel_step_const_i<EFLAG>(
|
kernel_step_const_i<EFLAG>(
|
||||||
eatom, vflag,
|
eatom, vflag,
|
||||||
numneigh, cnumneigh, firstneigh, ntypes,
|
numneigh, cnumneigh, firstneigh, ntypes,
|
||||||
x, c_inner, c_outer, f,
|
x, c_inner, c_outer, f,
|
||||||
&vsevdwl, compress_idx,
|
&vsevdwl, compress_idx,
|
||||||
i, js, vmask_repulsive
|
i, js, vmask_repulsive
|
||||||
);
|
);
|
||||||
compress_idx = 0;
|
compress_idx = 0;
|
||||||
@ -1209,10 +1209,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
|
|||||||
if (compress_idx > 0) {
|
if (compress_idx > 0) {
|
||||||
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
|
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
|
||||||
IntelKernelTersoff::kernel_step<EFLAG>(
|
IntelKernelTersoff::kernel_step<EFLAG>(
|
||||||
eatom, vflag,
|
eatom, vflag,
|
||||||
numneigh, cnumneigh, firstneigh, ntypes,
|
numneigh, cnumneigh, firstneigh, ntypes,
|
||||||
x, c_inner, c_outer, f,
|
x, c_inner, c_outer, f,
|
||||||
&vsevdwl, compress_idx,
|
&vsevdwl, compress_idx,
|
||||||
is, js, vmask_repulsive
|
is, js, vmask_repulsive
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -1224,10 +1224,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
|
|||||||
|
|
||||||
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
||||||
IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector(
|
IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector(
|
||||||
const c_inner_t * param,
|
const c_inner_t * param,
|
||||||
ivec xjw, bvec mask,
|
ivec xjw, bvec mask,
|
||||||
fvec vrij, fvec rsq2,
|
fvec vrij, fvec rsq2,
|
||||||
fvec vdijx, fvec vdijy, fvec vdijz,
|
fvec vdijx, fvec vdijy, fvec vdijz,
|
||||||
fvec dikx, fvec diky, fvec dikz
|
fvec dikx, fvec diky, fvec dikz
|
||||||
) {
|
) {
|
||||||
fvec v_1_0(1.0);
|
fvec v_1_0(1.0);
|
||||||
@ -1250,7 +1250,7 @@ IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t
|
|||||||
// Its kind of important to check the mask.
|
// Its kind of important to check the mask.
|
||||||
// Some simulations never/rarely invoke this branch.
|
// Some simulations never/rarely invoke this branch.
|
||||||
if (! v::mask_testz(vmask_need_sine)) {
|
if (! v::mask_testz(vmask_need_sine)) {
|
||||||
vfc = v::blend(vmask_need_sine, vfc,
|
vfc = v::blend(vmask_need_sine, vfc,
|
||||||
v_0_5 * (v_1_0 - sin(fvec(MY_PI2) * (vrik - vpbigr) * v::recip(vpbigd))));
|
v_0_5 * (v_1_0 - sin(fvec(MY_PI2) * (vrik - vpbigr) * v::recip(vpbigd))));
|
||||||
}
|
}
|
||||||
return vgijk * vex_delr * vfc;
|
return vgijk * vex_delr * vfc;
|
||||||
@ -1258,7 +1258,7 @@ IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t
|
|||||||
|
|
||||||
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
||||||
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::force_zeta_vector(
|
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::force_zeta_vector(
|
||||||
const c_outer_t * param,
|
const c_outer_t * param,
|
||||||
ivec xjw,
|
ivec xjw,
|
||||||
bvec mask,
|
bvec mask,
|
||||||
fvec vrij, fvec vzeta_ij,
|
fvec vrij, fvec vzeta_ij,
|
||||||
@ -1402,9 +1402,9 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
|
|||||||
vfc_d = v::blend(vmask_need_sine, vfc_d, fvec(-0.5) * vtmp * vfccos);
|
vfc_d = v::blend(vmask_need_sine, vfc_d, fvec(-0.5) * vtmp * vfccos);
|
||||||
}
|
}
|
||||||
|
|
||||||
fvec vzeta_d_fc = vfc_d * vgijk * vex_delr;
|
fvec vzeta_d_fc = vfc_d * vgijk * vex_delr;
|
||||||
fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr;
|
fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr;
|
||||||
fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d;
|
fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d;
|
||||||
if (ZETA) *zeta = vfc * vgijk * vex_delr;
|
if (ZETA) *zeta = vfc * vgijk * vex_delr;
|
||||||
|
|
||||||
fvec vminus_costheta = - vcostheta;
|
fvec vminus_costheta = - vcostheta;
|
||||||
@ -1417,7 +1417,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
|
|||||||
fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx);
|
fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx);
|
||||||
fvec vdcosdriy = -(vdcosdrjy + vdcosdrky);
|
fvec vdcosdriy = -(vdcosdrjy + vdcosdrky);
|
||||||
fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz);
|
fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz);
|
||||||
|
|
||||||
*fix = vprefactor * (vzeta_d_gijk * vdcosdrix + vzeta_d_ex_delr * (rik_hatx - vrij_hatx) - vzeta_d_fc * rik_hatx);
|
*fix = vprefactor * (vzeta_d_gijk * vdcosdrix + vzeta_d_ex_delr * (rik_hatx - vrij_hatx) - vzeta_d_fc * rik_hatx);
|
||||||
*fiy = vprefactor * (vzeta_d_gijk * vdcosdriy + vzeta_d_ex_delr * (rik_haty - vrij_haty) - vzeta_d_fc * rik_haty);
|
*fiy = vprefactor * (vzeta_d_gijk * vdcosdriy + vzeta_d_ex_delr * (rik_haty - vrij_haty) - vzeta_d_fc * rik_haty);
|
||||||
*fiz = vprefactor * (vzeta_d_gijk * vdcosdriz + vzeta_d_ex_delr * (rik_hatz - vrij_hatz) - vzeta_d_fc * rik_hatz);
|
*fiz = vprefactor * (vzeta_d_gijk * vdcosdriz + vzeta_d_ex_delr * (rik_hatz - vrij_hatz) - vzeta_d_fc * rik_hatz);
|
||||||
|
|||||||
@ -75,14 +75,14 @@ class PairTersoffIntel : public PairTersoff {
|
|||||||
};
|
};
|
||||||
ForceConst<float> force_const_single;
|
ForceConst<float> force_const_single;
|
||||||
ForceConst<double> force_const_double;
|
ForceConst<double> force_const_double;
|
||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc);
|
const ForceConst<flt_t> &fc);
|
||||||
template <int EFLAG, class flt_t, class acc_t>
|
template <int EFLAG, class flt_t, class acc_t>
|
||||||
void eval(const int offload, const int vflag,
|
void eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> * buffers,
|
IntelBuffers<flt_t,acc_t> * buffers,
|
||||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||||
|
|
||||||
template <class flt_t, class acc_t>
|
template <class flt_t, class acc_t>
|
||||||
void pack_force_const(ForceConst<flt_t> &fc,
|
void pack_force_const(ForceConst<flt_t> &fc,
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -1,238 +1,238 @@
|
|||||||
/* -*- c++ -*- ----------------------------------------------------------
|
/* -*- c++ -*- ----------------------------------------------------------
|
||||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||||
http://lammps.sandia.gov, Sandia National Laboratories
|
http://lammps.sandia.gov, Sandia National Laboratories
|
||||||
Steve Plimpton, sjplimp@sandia.gov
|
Steve Plimpton, sjplimp@sandia.gov
|
||||||
|
|
||||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||||
certain rights in this software. This software is distributed under
|
certain rights in this software. This software is distributed under
|
||||||
the GNU General Public License.
|
the GNU General Public License.
|
||||||
|
|
||||||
See the README file in the top-level LAMMPS directory.
|
See the README file in the top-level LAMMPS directory.
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
Contributing authors: William McDoniel (RWTH Aachen University)
|
Contributing authors: William McDoniel (RWTH Aachen University)
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
#ifdef KSPACE_CLASS
|
#ifdef KSPACE_CLASS
|
||||||
|
|
||||||
KSpaceStyle(pppm/disp/intel,PPPMDispIntel)
|
KSpaceStyle(pppm/disp/intel,PPPMDispIntel)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifndef LMP_PPPMINTEL_DISP_H
|
#ifndef LMP_PPPMINTEL_DISP_H
|
||||||
#define LMP_PPPMINTEL_DISP_H
|
#define LMP_PPPMINTEL_DISP_H
|
||||||
|
|
||||||
#include "pppm_disp.h"
|
#include "pppm_disp.h"
|
||||||
#include "fix_intel.h"
|
#include "fix_intel.h"
|
||||||
|
|
||||||
namespace LAMMPS_NS {
|
namespace LAMMPS_NS {
|
||||||
|
|
||||||
class PPPMDispIntel : public PPPMDisp {
|
class PPPMDispIntel : public PPPMDisp {
|
||||||
public:
|
public:
|
||||||
PPPMDispIntel(class LAMMPS *, int, char **);
|
PPPMDispIntel(class LAMMPS *, int, char **);
|
||||||
virtual ~PPPMDispIntel();
|
virtual ~PPPMDispIntel();
|
||||||
virtual void init();
|
virtual void init();
|
||||||
virtual void compute(int, int);
|
virtual void compute(int, int);
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
int use_base();
|
int use_base();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
FixIntel *fix;
|
FixIntel *fix;
|
||||||
|
|
||||||
int _use_lrt;
|
int _use_lrt;
|
||||||
FFT_SCALAR **perthread_density;
|
FFT_SCALAR **perthread_density;
|
||||||
FFT_SCALAR *particle_ekx;
|
FFT_SCALAR *particle_ekx;
|
||||||
FFT_SCALAR *particle_eky;
|
FFT_SCALAR *particle_eky;
|
||||||
FFT_SCALAR *particle_ekz;
|
FFT_SCALAR *particle_ekz;
|
||||||
FFT_SCALAR *particle_ekx0;
|
FFT_SCALAR *particle_ekx0;
|
||||||
FFT_SCALAR *particle_eky0;
|
FFT_SCALAR *particle_eky0;
|
||||||
FFT_SCALAR *particle_ekz0;
|
FFT_SCALAR *particle_ekz0;
|
||||||
FFT_SCALAR *particle_ekx1;
|
FFT_SCALAR *particle_ekx1;
|
||||||
FFT_SCALAR *particle_eky1;
|
FFT_SCALAR *particle_eky1;
|
||||||
FFT_SCALAR *particle_ekz1;
|
FFT_SCALAR *particle_ekz1;
|
||||||
FFT_SCALAR *particle_ekx2;
|
FFT_SCALAR *particle_ekx2;
|
||||||
FFT_SCALAR *particle_eky2;
|
FFT_SCALAR *particle_eky2;
|
||||||
FFT_SCALAR *particle_ekz2;
|
FFT_SCALAR *particle_ekz2;
|
||||||
FFT_SCALAR *particle_ekx3;
|
FFT_SCALAR *particle_ekx3;
|
||||||
FFT_SCALAR *particle_eky3;
|
FFT_SCALAR *particle_eky3;
|
||||||
FFT_SCALAR *particle_ekz3;
|
FFT_SCALAR *particle_ekz3;
|
||||||
FFT_SCALAR *particle_ekx4;
|
FFT_SCALAR *particle_ekx4;
|
||||||
FFT_SCALAR *particle_eky4;
|
FFT_SCALAR *particle_eky4;
|
||||||
FFT_SCALAR *particle_ekz4;
|
FFT_SCALAR *particle_ekz4;
|
||||||
FFT_SCALAR *particle_ekx5;
|
FFT_SCALAR *particle_ekx5;
|
||||||
FFT_SCALAR *particle_eky5;
|
FFT_SCALAR *particle_eky5;
|
||||||
FFT_SCALAR *particle_ekz5;
|
FFT_SCALAR *particle_ekz5;
|
||||||
FFT_SCALAR *particle_ekx6;
|
FFT_SCALAR *particle_ekx6;
|
||||||
FFT_SCALAR *particle_eky6;
|
FFT_SCALAR *particle_eky6;
|
||||||
FFT_SCALAR *particle_ekz6;
|
FFT_SCALAR *particle_ekz6;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int _use_table;
|
int _use_table;
|
||||||
int rho_points;
|
int rho_points;
|
||||||
FFT_SCALAR **rho_lookup;
|
FFT_SCALAR **rho_lookup;
|
||||||
FFT_SCALAR **rho6_lookup;
|
FFT_SCALAR **rho6_lookup;
|
||||||
FFT_SCALAR **drho_lookup;
|
FFT_SCALAR **drho_lookup;
|
||||||
FFT_SCALAR **drho6_lookup;
|
FFT_SCALAR **drho6_lookup;
|
||||||
FFT_SCALAR half_rho_scale, half_rho_scale_plus;
|
FFT_SCALAR half_rho_scale, half_rho_scale_plus;
|
||||||
|
|
||||||
int _use_packing;
|
int _use_packing;
|
||||||
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
int _use_base;
|
int _use_base;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void particle_map(double, double, double,
|
void particle_map(double, double, double,
|
||||||
double, int **, int, int,
|
double, int **, int, int,
|
||||||
int, int, int,
|
int, int, int,
|
||||||
int, int, int,
|
int, int, int,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers);
|
IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
|
|
||||||
template<class flt_t, class acc_t, int use_table>
|
template<class flt_t, class acc_t, int use_table>
|
||||||
void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers);
|
void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) {
|
void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||||
if (_use_table == 1) {
|
if (_use_table == 1) {
|
||||||
make_rho_c<flt_t,acc_t,1>(buffers);
|
make_rho_c<flt_t,acc_t,1>(buffers);
|
||||||
} else {
|
} else {
|
||||||
make_rho_c<flt_t,acc_t,0>(buffers);
|
make_rho_c<flt_t,acc_t,0>(buffers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class flt_t, class acc_t, int use_table>
|
template<class flt_t, class acc_t, int use_table>
|
||||||
void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers);
|
void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) {
|
void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||||
if (_use_table == 1) {
|
if (_use_table == 1) {
|
||||||
make_rho_g<flt_t,acc_t,1>(buffers);
|
make_rho_g<flt_t,acc_t,1>(buffers);
|
||||||
} else {
|
} else {
|
||||||
make_rho_g<flt_t,acc_t,0>(buffers);
|
make_rho_g<flt_t,acc_t,0>(buffers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class flt_t, class acc_t, int use_table>
|
template<class flt_t, class acc_t, int use_table>
|
||||||
void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers);
|
void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) {
|
void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||||
if (_use_table == 1) {
|
if (_use_table == 1) {
|
||||||
make_rho_a<flt_t,acc_t,1>(buffers);
|
make_rho_a<flt_t,acc_t,1>(buffers);
|
||||||
} else {
|
} else {
|
||||||
make_rho_a<flt_t,acc_t,0>(buffers);
|
make_rho_a<flt_t,acc_t,0>(buffers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class flt_t, class acc_t, int use_table>
|
template<class flt_t, class acc_t, int use_table>
|
||||||
void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers);
|
void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) {
|
void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||||
if (_use_table == 1) {
|
if (_use_table == 1) {
|
||||||
make_rho_none<flt_t,acc_t,1>(buffers);
|
make_rho_none<flt_t,acc_t,1>(buffers);
|
||||||
} else {
|
} else {
|
||||||
make_rho_none<flt_t,acc_t,0>(buffers);
|
make_rho_none<flt_t,acc_t,0>(buffers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class flt_t, class acc_t, int use_table>
|
template<class flt_t, class acc_t, int use_table>
|
||||||
void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||||
if (_use_table == 1) {
|
if (_use_table == 1) {
|
||||||
fieldforce_c_ik<flt_t,acc_t,1>(buffers);
|
fieldforce_c_ik<flt_t,acc_t,1>(buffers);
|
||||||
} else {
|
} else {
|
||||||
fieldforce_c_ik<flt_t,acc_t,0>(buffers);
|
fieldforce_c_ik<flt_t,acc_t,0>(buffers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class flt_t, class acc_t, int use_table>
|
template<class flt_t, class acc_t, int use_table>
|
||||||
void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||||
if (_use_table == 1) {
|
if (_use_table == 1) {
|
||||||
fieldforce_c_ad<flt_t,acc_t,1>(buffers);
|
fieldforce_c_ad<flt_t,acc_t,1>(buffers);
|
||||||
} else {
|
} else {
|
||||||
fieldforce_c_ad<flt_t,acc_t,0>(buffers);
|
fieldforce_c_ad<flt_t,acc_t,0>(buffers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class flt_t, class acc_t, int use_table>
|
template<class flt_t, class acc_t, int use_table>
|
||||||
void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||||
if (_use_table == 1) {
|
if (_use_table == 1) {
|
||||||
fieldforce_g_ik<flt_t,acc_t,1>(buffers);
|
fieldforce_g_ik<flt_t,acc_t,1>(buffers);
|
||||||
} else {
|
} else {
|
||||||
fieldforce_g_ik<flt_t,acc_t,0>(buffers);
|
fieldforce_g_ik<flt_t,acc_t,0>(buffers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class flt_t, class acc_t, int use_table>
|
template<class flt_t, class acc_t, int use_table>
|
||||||
void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||||
if (_use_table == 1) {
|
if (_use_table == 1) {
|
||||||
fieldforce_g_ad<flt_t,acc_t,1>(buffers);
|
fieldforce_g_ad<flt_t,acc_t,1>(buffers);
|
||||||
} else {
|
} else {
|
||||||
fieldforce_g_ad<flt_t,acc_t,0>(buffers);
|
fieldforce_g_ad<flt_t,acc_t,0>(buffers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class flt_t, class acc_t, int use_table>
|
template<class flt_t, class acc_t, int use_table>
|
||||||
void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||||
if (_use_table == 1) {
|
if (_use_table == 1) {
|
||||||
fieldforce_a_ik<flt_t,acc_t,1>(buffers);
|
fieldforce_a_ik<flt_t,acc_t,1>(buffers);
|
||||||
} else {
|
} else {
|
||||||
fieldforce_a_ik<flt_t,acc_t,0>(buffers);
|
fieldforce_a_ik<flt_t,acc_t,0>(buffers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class flt_t, class acc_t, int use_table>
|
template<class flt_t, class acc_t, int use_table>
|
||||||
void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||||
if (_use_table == 1) {
|
if (_use_table == 1) {
|
||||||
fieldforce_a_ad<flt_t,acc_t,1>(buffers);
|
fieldforce_a_ad<flt_t,acc_t,1>(buffers);
|
||||||
} else {
|
} else {
|
||||||
fieldforce_a_ad<flt_t,acc_t,0>(buffers);
|
fieldforce_a_ad<flt_t,acc_t,0>(buffers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class flt_t, class acc_t, int use_table>
|
template<class flt_t, class acc_t, int use_table>
|
||||||
void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||||
if (_use_table == 1) {
|
if (_use_table == 1) {
|
||||||
fieldforce_none_ik<flt_t,acc_t,1>(buffers);
|
fieldforce_none_ik<flt_t,acc_t,1>(buffers);
|
||||||
} else {
|
} else {
|
||||||
fieldforce_none_ik<flt_t,acc_t,0>(buffers);
|
fieldforce_none_ik<flt_t,acc_t,0>(buffers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class flt_t, class acc_t, int use_table>
|
template<class flt_t, class acc_t, int use_table>
|
||||||
void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||||
if (_use_table == 1) {
|
if (_use_table == 1) {
|
||||||
fieldforce_none_ad<flt_t,acc_t,1>(buffers);
|
fieldforce_none_ad<flt_t,acc_t,1>(buffers);
|
||||||
} else {
|
} else {
|
||||||
fieldforce_none_ad<flt_t,acc_t,0>(buffers);
|
fieldforce_none_ad<flt_t,acc_t,0>(buffers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void precompute_rho();
|
void precompute_rho();
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -14,7 +14,7 @@
|
|||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
Contributing authors: William McDoniel (RWTH Aachen University)
|
Contributing authors: William McDoniel (RWTH Aachen University)
|
||||||
Rodrigo Canales (RWTH Aachen University)
|
Rodrigo Canales (RWTH Aachen University)
|
||||||
Markus Hoehnerbach (RWTH Aachen University)
|
Markus Hoehnerbach (RWTH Aachen University)
|
||||||
W. Michael Brown (Intel)
|
W. Michael Brown (Intel)
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
@ -62,10 +62,10 @@ PPPMIntel::PPPMIntel(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
|
|||||||
|
|
||||||
perthread_density = NULL;
|
perthread_density = NULL;
|
||||||
particle_ekx = particle_eky = particle_ekz = NULL;
|
particle_ekx = particle_eky = particle_ekz = NULL;
|
||||||
|
|
||||||
rho_lookup = drho_lookup = NULL;
|
rho_lookup = drho_lookup = NULL;
|
||||||
rho_points = 0;
|
rho_points = 0;
|
||||||
|
|
||||||
vdxy_brick = vdz0_brick = NULL;
|
vdxy_brick = vdz0_brick = NULL;
|
||||||
work3 = NULL;
|
work3 = NULL;
|
||||||
cg_pack = NULL;
|
cg_pack = NULL;
|
||||||
@ -120,20 +120,20 @@ void PPPMIntel::init()
|
|||||||
if ((comm->nthreads > 1) && !_use_lrt) {
|
if ((comm->nthreads > 1) && !_use_lrt) {
|
||||||
memory->destroy(perthread_density);
|
memory->destroy(perthread_density);
|
||||||
memory->create(perthread_density, comm->nthreads-1,
|
memory->create(perthread_density, comm->nthreads-1,
|
||||||
ngrid + INTEL_P3M_ALIGNED_MAXORDER,
|
ngrid + INTEL_P3M_ALIGNED_MAXORDER,
|
||||||
"pppmintel:perthread_density");
|
"pppmintel:perthread_density");
|
||||||
}
|
}
|
||||||
|
|
||||||
_use_table = fix->pppm_table();
|
_use_table = fix->pppm_table();
|
||||||
if (_use_table) {
|
if (_use_table) {
|
||||||
rho_points = 5000;
|
rho_points = 5000;
|
||||||
memory->destroy(rho_lookup);
|
memory->destroy(rho_lookup);
|
||||||
memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
|
memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
|
||||||
"pppmintel:rho_lookup");
|
"pppmintel:rho_lookup");
|
||||||
if(differentiation_flag == 1) {
|
if(differentiation_flag == 1) {
|
||||||
memory->destroy(drho_lookup);
|
memory->destroy(drho_lookup);
|
||||||
memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
|
memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
|
||||||
"pppmintel:drho_lookup");
|
"pppmintel:drho_lookup");
|
||||||
}
|
}
|
||||||
precompute_rho();
|
precompute_rho();
|
||||||
}
|
}
|
||||||
@ -141,7 +141,7 @@ void PPPMIntel::init()
|
|||||||
if (order > INTEL_P3M_MAXORDER)
|
if (order > INTEL_P3M_MAXORDER)
|
||||||
error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n");
|
error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n");
|
||||||
|
|
||||||
_use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16)
|
_use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16)
|
||||||
&& (sizeof(FFT_SCALAR) == sizeof(float))
|
&& (sizeof(FFT_SCALAR) == sizeof(float))
|
||||||
&& (differentiation_flag == 0);
|
&& (differentiation_flag == 0);
|
||||||
if (_use_packing) {
|
if (_use_packing) {
|
||||||
@ -149,13 +149,13 @@ void PPPMIntel::init()
|
|||||||
memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
|
memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
|
||||||
memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
|
memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
|
||||||
memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
|
memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
|
||||||
memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
|
memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
|
||||||
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
|
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
|
||||||
"pppmintel:vdxy_brick");
|
"pppmintel:vdxy_brick");
|
||||||
memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
|
memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
|
||||||
memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
|
memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
|
||||||
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
|
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
|
||||||
"pppmintel:vdz0_brick");
|
"pppmintel:vdz0_brick");
|
||||||
memory->destroy(work3);
|
memory->destroy(work3);
|
||||||
memory->create(work3, 2*nfft_both, "pppmintel:work3");
|
memory->create(work3, 2*nfft_both, "pppmintel:work3");
|
||||||
|
|
||||||
@ -163,10 +163,10 @@ void PPPMIntel::init()
|
|||||||
delete cg_pack;
|
delete cg_pack;
|
||||||
int (*procneigh)[2] = comm->procneigh;
|
int (*procneigh)[2] = comm->procneigh;
|
||||||
cg_pack = new GridComm(lmp,world,2,0, 2*nxlo_in,2*nxhi_in+1,nylo_in,
|
cg_pack = new GridComm(lmp,world,2,0, 2*nxlo_in,2*nxhi_in+1,nylo_in,
|
||||||
nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1,
|
nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1,
|
||||||
nylo_out,nyhi_out,nzlo_out,nzhi_out,
|
nylo_out,nyhi_out,nzlo_out,nzhi_out,
|
||||||
procneigh[0][0],procneigh[0][1],procneigh[1][0],
|
procneigh[0][0],procneigh[0][1],procneigh[1][0],
|
||||||
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
|
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
|
||||||
|
|
||||||
cg_pack->ghost_notify();
|
cg_pack->ghost_notify();
|
||||||
cg_pack->setup();
|
cg_pack->setup();
|
||||||
@ -484,7 +484,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
{
|
{
|
||||||
const int nix = nxhi_out - nxlo_out + 1;
|
const int nix = nxhi_out - nxlo_out + 1;
|
||||||
const int niy = nyhi_out - nylo_out + 1;
|
const int niy = nyhi_out - nylo_out + 1;
|
||||||
|
|
||||||
const flt_t lo0 = boxlo[0];
|
const flt_t lo0 = boxlo[0];
|
||||||
const flt_t lo1 = boxlo[1];
|
const flt_t lo1 = boxlo[1];
|
||||||
const flt_t lo2 = boxlo[2];
|
const flt_t lo2 = boxlo[2];
|
||||||
@ -503,7 +503,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
memset(my_density, 0, ngrid * sizeof(FFT_SCALAR));
|
memset(my_density, 0, ngrid * sizeof(FFT_SCALAR));
|
||||||
|
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
|
|
||||||
int nx = part2grid[i][0];
|
int nx = part2grid[i][0];
|
||||||
int ny = part2grid[i][1];
|
int ny = part2grid[i][1];
|
||||||
int nz = part2grid[i][2];
|
int nz = part2grid[i][2];
|
||||||
@ -515,9 +515,9 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
|
FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
|
||||||
FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
|
FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
|
||||||
FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
|
FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
|
||||||
|
|
||||||
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
||||||
|
|
||||||
if (use_table) {
|
if (use_table) {
|
||||||
dx = dx*half_rho_scale + half_rho_scale_plus;
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
||||||
int idx = dx;
|
int idx = dx;
|
||||||
@ -527,7 +527,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
int idz = dz;
|
int idz = dz;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho[0][k] = rho_lookup[idx][k];
|
rho[0][k] = rho_lookup[idx][k];
|
||||||
rho[1][k] = rho_lookup[idy][k];
|
rho[1][k] = rho_lookup[idy][k];
|
||||||
@ -536,11 +536,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for (int k = nlower; k <= nupper; k++) {
|
for (int k = nlower; k <= nupper; k++) {
|
||||||
FFT_SCALAR r1,r2,r3;
|
FFT_SCALAR r1,r2,r3;
|
||||||
r1 = r2 = r3 = ZEROF;
|
r1 = r2 = r3 = ZEROF;
|
||||||
|
|
||||||
for (int l = order-1; l >= 0; l--) {
|
for (int l = order-1; l >= 0; l--) {
|
||||||
r1 = rho_coeff[l][k] + r1*dx;
|
r1 = rho_coeff[l][k] + r1*dx;
|
||||||
r2 = rho_coeff[l][k] + r2*dy;
|
r2 = rho_coeff[l][k] + r2*dy;
|
||||||
@ -551,24 +551,24 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
rho[2][k-nlower] = r3;
|
rho[2][k-nlower] = r3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
FFT_SCALAR z0 = fdelvolinv * q[i];
|
FFT_SCALAR z0 = fdelvolinv * q[i];
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count=7
|
#pragma loop_count=7
|
||||||
#endif
|
#endif
|
||||||
for (int n = 0; n < order; n++) {
|
for (int n = 0; n < order; n++) {
|
||||||
int mz = n*nix*niy + nzsum;
|
int mz = n*nix*niy + nzsum;
|
||||||
FFT_SCALAR y0 = z0*rho[2][n];
|
FFT_SCALAR y0 = z0*rho[2][n];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count=7
|
#pragma loop_count=7
|
||||||
#endif
|
#endif
|
||||||
for (int m = 0; m < order; m++) {
|
for (int m = 0; m < order; m++) {
|
||||||
int mzy = m*nix + mz;
|
int mzy = m*nix + mz;
|
||||||
FFT_SCALAR x0 = y0*rho[1][m];
|
FFT_SCALAR x0 = y0*rho[1][m];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||||
int mzyx = l + mzy;
|
int mzyx = l + mzy;
|
||||||
my_density[mzyx] += x0*rho[0][l];
|
my_density[mzyx] += x0*rho[0][l];
|
||||||
@ -709,21 +709,21 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count=7
|
#pragma loop_count=7
|
||||||
#endif
|
#endif
|
||||||
for (int n = 0; n < order; n++) {
|
for (int n = 0; n < order; n++) {
|
||||||
int mz = n+nzsum;
|
int mz = n+nzsum;
|
||||||
FFT_SCALAR z0 = rho2[n];
|
FFT_SCALAR z0 = rho2[n];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count=7
|
#pragma loop_count=7
|
||||||
#endif
|
#endif
|
||||||
for (int m = 0; m < order; m++) {
|
for (int m = 0; m < order; m++) {
|
||||||
int my = m+nysum;
|
int my = m+nysum;
|
||||||
FFT_SCALAR y0 = z0*rho1[m];
|
FFT_SCALAR y0 = z0*rho1[m];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < (use_packing ? 2 : 1) *
|
for (int l = 0; l < (use_packing ? 2 : 1) *
|
||||||
INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||||
int mx = l+nxsum;
|
int mx = l+nxsum;
|
||||||
FFT_SCALAR x0 = y0*rho0[l];
|
FFT_SCALAR x0 = y0*rho0[l];
|
||||||
if (use_packing) {
|
if (use_packing) {
|
||||||
@ -824,13 +824,13 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
const flt_t fsf_coeff3 = sf_coeff[3];
|
const flt_t fsf_coeff3 = sf_coeff[3];
|
||||||
const flt_t fsf_coeff4 = sf_coeff[4];
|
const flt_t fsf_coeff4 = sf_coeff[4];
|
||||||
const flt_t fsf_coeff5 = sf_coeff[5];
|
const flt_t fsf_coeff5 = sf_coeff[5];
|
||||||
|
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
|
||||||
|
|
||||||
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
_alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
||||||
_alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
_alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
||||||
|
|
||||||
for (int i = ifrom; i < ito; i++) {
|
for (int i = ifrom; i < ito; i++) {
|
||||||
int nx = part2grid[i][0];
|
int nx = part2grid[i][0];
|
||||||
int ny = part2grid[i][1];
|
int ny = part2grid[i][1];
|
||||||
@ -838,11 +838,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
|
FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
|
||||||
FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
|
FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
|
||||||
FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
|
FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
|
||||||
|
|
||||||
int nxsum = nx + nlower;
|
int nxsum = nx + nlower;
|
||||||
int nysum = ny + nlower;
|
int nysum = ny + nlower;
|
||||||
int nzsum = nz + nlower;
|
int nzsum = nz + nlower;
|
||||||
|
|
||||||
if (use_table) {
|
if (use_table) {
|
||||||
dx = dx*half_rho_scale + half_rho_scale_plus;
|
dx = dx*half_rho_scale + half_rho_scale_plus;
|
||||||
int idx = dx;
|
int idx = dx;
|
||||||
@ -852,7 +852,7 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
int idz = dz;
|
int idz = dz;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
|
||||||
rho[0][k] = rho_lookup[idx][k];
|
rho[0][k] = rho_lookup[idx][k];
|
||||||
rho[1][k] = rho_lookup[idy][k];
|
rho[1][k] = rho_lookup[idy][k];
|
||||||
@ -864,11 +864,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
} else {
|
} else {
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for (int k = nlower; k <= nupper; k++) {
|
for (int k = nlower; k <= nupper; k++) {
|
||||||
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
|
FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
|
||||||
dr1 = dr2 = dr3 = ZEROF;
|
dr1 = dr2 = dr3 = ZEROF;
|
||||||
|
|
||||||
r1 = rho_coeff[order-1][k];
|
r1 = rho_coeff[order-1][k];
|
||||||
r2 = rho_coeff[order-1][k];
|
r2 = rho_coeff[order-1][k];
|
||||||
r3 = rho_coeff[order-1][k];
|
r3 = rho_coeff[order-1][k];
|
||||||
@ -888,21 +888,21 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
drho[2][k-nlower] = dr3;
|
drho[2][k-nlower] = dr3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
_alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
||||||
_alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
_alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
||||||
_alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
_alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
||||||
|
|
||||||
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
|
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count=7
|
#pragma loop_count=7
|
||||||
#endif
|
#endif
|
||||||
for (int n = 0; n < order; n++) {
|
for (int n = 0; n < order; n++) {
|
||||||
int mz = n + nzsum;
|
int mz = n + nzsum;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma loop_count=7
|
#pragma loop_count=7
|
||||||
#endif
|
#endif
|
||||||
for (int m = 0; m < order; m++) {
|
for (int m = 0; m < order; m++) {
|
||||||
int my = m + nysum;
|
int my = m + nysum;
|
||||||
FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
|
FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
|
||||||
@ -910,7 +910,7 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||||
int mx = l + nxsum;
|
int mx = l + nxsum;
|
||||||
ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
|
ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
|
||||||
@ -919,17 +919,17 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
|
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
|
||||||
particle_ekx[i] += ekx[l];
|
particle_ekx[i] += ekx[l];
|
||||||
particle_eky[i] += eky[l];
|
particle_eky[i] += eky[l];
|
||||||
particle_ekz[i] += ekz[l];
|
particle_ekz[i] += ekz[l];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma simd
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
@ -937,12 +937,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
particle_ekx[i] *= hx_inv;
|
particle_ekx[i] *= hx_inv;
|
||||||
particle_eky[i] *= hy_inv;
|
particle_eky[i] *= hy_inv;
|
||||||
particle_ekz[i] *= hz_inv;
|
particle_ekz[i] *= hz_inv;
|
||||||
|
|
||||||
// convert E-field to force
|
// convert E-field to force
|
||||||
|
|
||||||
const flt_t qfactor = fqqrd2es * q[i];
|
const flt_t qfactor = fqqrd2es * q[i];
|
||||||
const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];
|
const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];
|
||||||
|
|
||||||
const flt_t s1 = x[i].x * hx_inv;
|
const flt_t s1 = x[i].x * hx_inv;
|
||||||
const flt_t s2 = x[i].y * hy_inv;
|
const flt_t s2 = x[i].y * hy_inv;
|
||||||
const flt_t s3 = x[i].z * hz_inv;
|
const flt_t s3 = x[i].z * hz_inv;
|
||||||
@ -950,16 +950,16 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||||||
sf += fsf_coeff1 * sin(ffour_pi * s1);
|
sf += fsf_coeff1 * sin(ffour_pi * s1);
|
||||||
sf *= twoqsq;
|
sf *= twoqsq;
|
||||||
f[i].x += qfactor * particle_ekx[i] - fqqrd2es * sf;
|
f[i].x += qfactor * particle_ekx[i] - fqqrd2es * sf;
|
||||||
|
|
||||||
sf = fsf_coeff2 * sin(ftwo_pi * s2);
|
sf = fsf_coeff2 * sin(ftwo_pi * s2);
|
||||||
sf += fsf_coeff3 * sin(ffour_pi * s2);
|
sf += fsf_coeff3 * sin(ffour_pi * s2);
|
||||||
sf *= twoqsq;
|
sf *= twoqsq;
|
||||||
f[i].y += qfactor * particle_eky[i] - fqqrd2es * sf;
|
f[i].y += qfactor * particle_eky[i] - fqqrd2es * sf;
|
||||||
|
|
||||||
sf = fsf_coeff4 * sin(ftwo_pi * s3);
|
sf = fsf_coeff4 * sin(ftwo_pi * s3);
|
||||||
sf += fsf_coeff5 * sin(ffour_pi * s3);
|
sf += fsf_coeff5 * sin(ffour_pi * s3);
|
||||||
sf *= twoqsq;
|
sf *= twoqsq;
|
||||||
|
|
||||||
if (slabflag != 2) f[i].z += qfactor * particle_ekz[i] - fqqrd2es * sf;
|
if (slabflag != 2) f[i].z += qfactor * particle_ekz[i] - fqqrd2es * sf;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1000,7 +1000,7 @@ void PPPMIntel::poisson_ik_intel()
|
|||||||
n = 0;
|
n = 0;
|
||||||
for (i = 0; i < nfft; i++) {
|
for (i = 0; i < nfft; i++) {
|
||||||
eng = s2 * greensfn[i] * (work1[n]*work1[n] +
|
eng = s2 * greensfn[i] * (work1[n]*work1[n] +
|
||||||
work1[n+1]*work1[n+1]);
|
work1[n+1]*work1[n+1]);
|
||||||
for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
|
for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
|
||||||
if (eflag_global) energy += eng;
|
if (eflag_global) energy += eng;
|
||||||
n += 2;
|
n += 2;
|
||||||
@ -1069,10 +1069,10 @@ void PPPMIntel::poisson_ik_intel()
|
|||||||
for (j = nylo_in; j <= nyhi_in; j++)
|
for (j = nylo_in; j <= nyhi_in; j++)
|
||||||
for (i = nxlo_in; i <= nxhi_in; i++) {
|
for (i = nxlo_in; i <= nxhi_in; i++) {
|
||||||
vdxy_brick[k][j][2*i] = work2[n];
|
vdxy_brick[k][j][2*i] = work2[n];
|
||||||
vdxy_brick[k][j][2*i+1] = work3[n];
|
vdxy_brick[k][j][2*i+1] = work3[n];
|
||||||
n += 2;
|
n += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
// z direction gradient
|
// z direction gradient
|
||||||
|
|
||||||
n = 0;
|
n = 0;
|
||||||
@ -1091,7 +1091,7 @@ void PPPMIntel::poisson_ik_intel()
|
|||||||
for (j = nylo_in; j <= nyhi_in; j++)
|
for (j = nylo_in; j <= nyhi_in; j++)
|
||||||
for (i = nxlo_in; i <= nxhi_in; i++) {
|
for (i = nxlo_in; i <= nxhi_in; i++) {
|
||||||
vdz0_brick[k][j][2*i] = work2[n];
|
vdz0_brick[k][j][2*i] = work2[n];
|
||||||
vdz0_brick[k][j][2*i+1] = 0.;
|
vdz0_brick[k][j][2*i+1] = 0.;
|
||||||
n += 2;
|
n += 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1202,7 +1202,7 @@ double PPPMIntel::memory_usage()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (_use_packing) {
|
if (_use_packing) {
|
||||||
bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1)
|
bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1)
|
||||||
* (2 * nxhi_out + 1 - 2 * nxlo_out + 1) * sizeof(FFT_SCALAR);
|
* (2 * nxhi_out + 1 - 2 * nxlo_out + 1) * sizeof(FFT_SCALAR);
|
||||||
bytes -= 3 * (nxhi_out - nxlo_out + 1) * (nyhi_out - nylo_out + 1)
|
bytes -= 3 * (nxhi_out - nxlo_out + 1) * (nyhi_out - nylo_out + 1)
|
||||||
* (nzhi_out - nzlo_out + 1) * sizeof(FFT_SCALAR);
|
* (nzhi_out - nzlo_out + 1) * sizeof(FFT_SCALAR);
|
||||||
@ -1228,7 +1228,7 @@ void PPPMIntel::pack_buffers()
|
|||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
|
||||||
packthreads,
|
packthreads,
|
||||||
sizeof(IntelBuffers<float,double>::atom_t));
|
sizeof(IntelBuffers<float,double>::atom_t));
|
||||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||||
fix->get_mixed_buffers()->thr_pack(ifrom,ito,1);
|
fix->get_mixed_buffers()->thr_pack(ifrom,ito,1);
|
||||||
|
|||||||
@ -14,7 +14,7 @@
|
|||||||
/* ----------------------------------------------------------------------
|
/* ----------------------------------------------------------------------
|
||||||
Contributing authors: William McDoniel (RWTH Aachen University)
|
Contributing authors: William McDoniel (RWTH Aachen University)
|
||||||
Rodrigo Canales (RWTH Aachen University)
|
Rodrigo Canales (RWTH Aachen University)
|
||||||
Markus Hoehnerbach (RWTH Aachen University)
|
Markus Hoehnerbach (RWTH Aachen University)
|
||||||
W. Michael Brown (Intel)
|
W. Michael Brown (Intel)
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
@ -77,7 +77,7 @@ class PPPMIntel : public PPPM {
|
|||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void test_function(IntelBuffers<flt_t,acc_t> *buffers);
|
void test_function(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
|
|
||||||
|
|
||||||
void precompute_rho();
|
void precompute_rho();
|
||||||
template<class flt_t, class acc_t>
|
template<class flt_t, class acc_t>
|
||||||
void particle_map(IntelBuffers<flt_t,acc_t> *buffers);
|
void particle_map(IntelBuffers<flt_t,acc_t> *buffers);
|
||||||
|
|||||||
@ -51,7 +51,7 @@ VerletLRTIntel::VerletLRTIntel(LAMMPS *lmp, int narg, char **arg) :
|
|||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
VerletLRTIntel::~VerletLRTIntel()
|
VerletLRTIntel::~VerletLRTIntel()
|
||||||
{
|
{
|
||||||
#if defined(_LMP_INTEL_LRT_PTHREAD)
|
#if defined(_LMP_INTEL_LRT_PTHREAD)
|
||||||
pthread_mutex_destroy(&_kmutex);
|
pthread_mutex_destroy(&_kmutex);
|
||||||
@ -67,10 +67,10 @@ void VerletLRTIntel::init()
|
|||||||
Verlet::init();
|
Verlet::init();
|
||||||
|
|
||||||
_intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0));
|
_intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0));
|
||||||
|
|
||||||
#ifdef LMP_INTEL_NOLRT
|
#ifdef LMP_INTEL_NOLRT
|
||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"LRT otion for Intel package disabled at compile time");
|
"LRT otion for Intel package disabled at compile time");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ void VerletLRTIntel::setup(int flag)
|
|||||||
if (_intel_kspace == 0) {
|
if (_intel_kspace == 0) {
|
||||||
Verlet::setup(flag);
|
Verlet::setup(flag);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (_intel_kspace->use_base()) {
|
if (_intel_kspace->use_base()) {
|
||||||
@ -154,15 +154,15 @@ void VerletLRTIntel::setup(int flag)
|
|||||||
_intel_kspace->setup();
|
_intel_kspace->setup();
|
||||||
|
|
||||||
#if defined(_LMP_INTEL_LRT_PTHREAD)
|
#if defined(_LMP_INTEL_LRT_PTHREAD)
|
||||||
pthread_create(&_kspace_thread, &_kspace_attr,
|
pthread_create(&_kspace_thread, &_kspace_attr,
|
||||||
&VerletLRTIntel::k_launch_loop, this);
|
&VerletLRTIntel::k_launch_loop, this);
|
||||||
#elif defined(_LMP_INTEL_LRT_11)
|
#elif defined(_LMP_INTEL_LRT_11)
|
||||||
std::thread kspace_thread;
|
std::thread kspace_thread;
|
||||||
if (kspace_compute_flag)
|
if (kspace_compute_flag)
|
||||||
_kspace_thread=std::thread([=]{ _intel_kspace->compute_first(eflag,
|
_kspace_thread=std::thread([=]{ _intel_kspace->compute_first(eflag,
|
||||||
vflag); });
|
vflag); });
|
||||||
else
|
else
|
||||||
_kspace_thread=std::thread([=]{ _intel_kspace->compute_dummy(eflag,
|
_kspace_thread=std::thread([=]{ _intel_kspace->compute_dummy(eflag,
|
||||||
vflag); });
|
vflag); });
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -297,8 +297,8 @@ void VerletLRTIntel::run(int n)
|
|||||||
pthread_mutex_unlock(&_kmutex);
|
pthread_mutex_unlock(&_kmutex);
|
||||||
#elif defined(_LMP_INTEL_LRT_11)
|
#elif defined(_LMP_INTEL_LRT_11)
|
||||||
std::thread kspace_thread;
|
std::thread kspace_thread;
|
||||||
if (kspace_compute_flag)
|
if (kspace_compute_flag)
|
||||||
kspace_thread=std::thread([=] {
|
kspace_thread=std::thread([=] {
|
||||||
_intel_kspace->compute_first(eflag, vflag);
|
_intel_kspace->compute_first(eflag, vflag);
|
||||||
timer->stamp(Timer::KSPACE);
|
timer->stamp(Timer::KSPACE);
|
||||||
} );
|
} );
|
||||||
@ -329,7 +329,7 @@ void VerletLRTIntel::run(int n)
|
|||||||
_kspace_done = 0;
|
_kspace_done = 0;
|
||||||
pthread_mutex_unlock(&_kmutex);
|
pthread_mutex_unlock(&_kmutex);
|
||||||
#elif defined(_LMP_INTEL_LRT_11)
|
#elif defined(_LMP_INTEL_LRT_11)
|
||||||
if (kspace_compute_flag)
|
if (kspace_compute_flag)
|
||||||
kspace_thread.join();
|
kspace_thread.join();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -367,7 +367,7 @@ void VerletLRTIntel::run(int n)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if defined(_LMP_INTEL_LRT_PTHREAD)
|
#if defined(_LMP_INTEL_LRT_PTHREAD)
|
||||||
if (run_cancelled)
|
if (run_cancelled)
|
||||||
pthread_cancel(_kspace_thread);
|
pthread_cancel(_kspace_thread);
|
||||||
else {
|
else {
|
||||||
pthread_mutex_lock(&_kmutex);
|
pthread_mutex_lock(&_kmutex);
|
||||||
@ -390,9 +390,9 @@ void * VerletLRTIntel::k_launch_loop(void *context)
|
|||||||
{
|
{
|
||||||
VerletLRTIntel * const c = (VerletLRTIntel *)context;
|
VerletLRTIntel * const c = (VerletLRTIntel *)context;
|
||||||
|
|
||||||
if (c->kspace_compute_flag)
|
if (c->kspace_compute_flag)
|
||||||
c->_intel_kspace->compute_first(c->eflag, c->vflag);
|
c->_intel_kspace->compute_first(c->eflag, c->vflag);
|
||||||
else
|
else
|
||||||
c->_intel_kspace->compute_dummy(c->eflag, c->vflag);
|
c->_intel_kspace->compute_dummy(c->eflag, c->vflag);
|
||||||
|
|
||||||
pthread_mutex_lock(&(c->_kmutex));
|
pthread_mutex_lock(&(c->_kmutex));
|
||||||
@ -408,7 +408,7 @@ void * VerletLRTIntel::k_launch_loop(void *context)
|
|||||||
pthread_mutex_unlock(&(c->_kmutex));
|
pthread_mutex_unlock(&(c->_kmutex));
|
||||||
|
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
|
|
||||||
if (c->kspace_compute_flag) {
|
if (c->kspace_compute_flag) {
|
||||||
c->_intel_kspace->compute_first(c->eflag, c->vflag);
|
c->_intel_kspace->compute_first(c->eflag, c->vflag);
|
||||||
c->timer->stamp(Timer::KSPACE);
|
c->timer->stamp(Timer::KSPACE);
|
||||||
|
|||||||
Reference in New Issue
Block a user