Fixing bug in pppm/intel for AVX-512 with single precision and ik diff.

This commit simplifies the code by eliminating packing optimizations that were slower
under typical parameters and had some memory bugs.
This commit is contained in:
Michael Brown
2017-10-08 08:33:07 -07:00
parent f2c1172741
commit 7e58f084d2
2 changed files with 19 additions and 284 deletions

View File

@ -66,11 +66,7 @@ PPPMIntel::PPPMIntel(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
rho_lookup = drho_lookup = NULL;
rho_points = 0;
vdxy_brick = vdz0_brick = NULL;
work3 = NULL;
cg_pack = NULL;
_use_table = _use_packing = _use_lrt = 0;
_use_table = _use_lrt = 0;
}
PPPMIntel::~PPPMIntel()
@ -82,12 +78,6 @@ PPPMIntel::~PPPMIntel()
memory->destroy(rho_lookup);
memory->destroy(drho_lookup);
memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
memory->destroy(work3);
delete cg_pack;
}
/* ----------------------------------------------------------------------
@ -141,36 +131,6 @@ void PPPMIntel::init()
if (order > INTEL_P3M_MAXORDER)
error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n");
_use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16)
&& (sizeof(FFT_SCALAR) == sizeof(float))
&& (differentiation_flag == 0);
if (_use_packing) {
memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
"pppmintel:vdxy_brick");
memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
"pppmintel:vdz0_brick");
memory->destroy(work3);
memory->create(work3, 2*nfft_both, "pppmintel:work3");
// new communicator for the double-size bricks
delete cg_pack;
int (*procneigh)[2] = comm->procneigh;
cg_pack = new GridComm(lmp,world,2,0, 2*nxlo_in,2*nxhi_in+1,nylo_in,
nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1,
nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
cg_pack->ghost_notify();
cg_pack->setup();
}
}
/* ----------------------------------------------------------------------
@ -272,18 +232,13 @@ void PPPMIntel::compute_first(int eflag, int vflag)
// also performs per-atom calculations via poisson_peratom()
if (differentiation_flag == 1) poisson_ad();
else poisson_ik_intel();
else poisson_ik();
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
else {
if (_use_packing)
cg_pack->forward_comm(this,FORWARD_IK);
else
cg->forward_comm(this,FORWARD_IK);
}
else cg->forward_comm(this,FORWARD_IK);
// extra per-atom energy/virial communication
@ -604,7 +559,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
interpolate from grid to get electric field & force on my particles for ik
------------------------------------------------------------------------- */
template<class flt_t, class acc_t, int use_table, int use_packing>
template<class flt_t, class acc_t, int use_table>
void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
{
// loop over my charges, interpolate electric field from nearby grid points
@ -649,9 +604,9 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
int ny = part2grid[i][1];
int nz = part2grid[i][2];
int nxsum = (use_packing ? 2 : 1) * (nx + nlower);
int nxsum = nx + nlower;
int nysum = ny + nlower;
int nzsum = nz + nlower;;
int nzsum = nz + nlower;
FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
@ -668,12 +623,7 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
#pragma simd
#endif
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
if (use_packing) {
rho0[2 * k] = rho_lookup[idx][k];
rho0[2 * k + 1] = rho_lookup[idx][k];
} else {
rho0[k] = rho_lookup[idx][k];
}
rho0[k] = rho_lookup[idx][k];
rho1[k] = rho_lookup[idy][k];
rho2[k] = rho_lookup[idz][k];
}
@ -690,12 +640,7 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
r2 = rho_coeff[l][k] + r2*dy;
r3 = rho_coeff[l][k] + r3*dz;
}
if (use_packing) {
rho0[2 * (k-nlower)] = r1;
rho0[2 * (k-nlower) + 1] = r1;
} else {
rho0[k-nlower] = r1;
}
rho0[k-nlower] = r1;
rho1[k-nlower] = r2;
rho2[k-nlower] = r3;
}
@ -722,18 +667,12 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#endif
for (int l = 0; l < (use_packing ? 2 : 1) *
INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
int mx = l+nxsum;
FFT_SCALAR x0 = y0*rho0[l];
if (use_packing) {
ekxy_arr[l] -= x0*vdxy_brick[mz][my][mx];
ekz0_arr[l] -= x0*vdz0_brick[mz][my][mx];
} else {
ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
eky_arr[l] -= x0*vdy_brick[mz][my][mx];
ekz_arr[l] -= x0*vdz_brick[mz][my][mx];
}
ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
eky_arr[l] -= x0*vdy_brick[mz][my][mx];
ekz_arr[l] -= x0*vdz_brick[mz][my][mx];
}
}
}
@ -741,18 +680,10 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR ekx, eky, ekz;
ekx = eky = ekz = ZEROF;
if (use_packing) {
for (int l = 0; l < 2*order; l += 2) {
ekx += ekxy_arr[l];
eky += ekxy_arr[l+1];
ekz += ekz0_arr[l];
}
} else {
for (int l = 0; l < order; l++) {
ekx += ekx_arr[l];
eky += eky_arr[l];
ekz += ekz_arr[l];
}
for (int l = 0; l < order; l++) {
ekx += ekx_arr[l];
eky += eky_arr[l];
ekz += ekz_arr[l];
}
// convert E-field to force
@ -965,137 +896,6 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
}
/* ----------------------------------------------------------------------
FFT-based Poisson solver for ik
Does special things for packing mode to avoid repeated copies
------------------------------------------------------------------------- */
void PPPMIntel::poisson_ik_intel()
{
if (_use_packing == 0) {
poisson_ik();
return;
}
int i,j,k,n;
double eng;
// transform charge density (r -> k)
n = 0;
for (i = 0; i < nfft; i++) {
work1[n++] = density_fft[i];
work1[n++] = ZEROF;
}
fft1->compute(work1,work1,1);
// global energy and virial contribution
double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
double s2 = scaleinv*scaleinv;
if (eflag_global || vflag_global) {
if (vflag_global) {
n = 0;
for (i = 0; i < nfft; i++) {
eng = s2 * greensfn[i] * (work1[n]*work1[n] +
work1[n+1]*work1[n+1]);
for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
if (eflag_global) energy += eng;
n += 2;
}
} else {
n = 0;
for (i = 0; i < nfft; i++) {
energy +=
s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]);
n += 2;
}
}
}
// scale by 1/total-grid-pts to get rho(k)
// multiply by Green's function to get V(k)
n = 0;
for (i = 0; i < nfft; i++) {
work1[n++] *= scaleinv * greensfn[i];
work1[n++] *= scaleinv * greensfn[i];
}
// extra FFTs for per-atom energy/virial
if (evflag_atom) poisson_peratom();
// triclinic system
if (triclinic) {
poisson_ik_triclinic();
return;
}
// compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k)
// FFT leaves data in 3d brick decomposition
// copy it into inner portion of vdx,vdy,vdz arrays
// x direction gradient
n = 0;
for (k = nzlo_fft; k <= nzhi_fft; k++)
for (j = nylo_fft; j <= nyhi_fft; j++)
for (i = nxlo_fft; i <= nxhi_fft; i++) {
work2[n] = fkx[i]*work1[n+1];
work2[n+1] = -fkx[i]*work1[n];
n += 2;
}
fft2->compute(work2,work2,-1);
// y direction gradient
n = 0;
for (k = nzlo_fft; k <= nzhi_fft; k++)
for (j = nylo_fft; j <= nyhi_fft; j++)
for (i = nxlo_fft; i <= nxhi_fft; i++) {
work3[n] = fky[j]*work1[n+1];
work3[n+1] = -fky[j]*work1[n];
n += 2;
}
fft2->compute(work3,work3,-1);
n = 0;
for (k = nzlo_in; k <= nzhi_in; k++)
for (j = nylo_in; j <= nyhi_in; j++)
for (i = nxlo_in; i <= nxhi_in; i++) {
vdxy_brick[k][j][2*i] = work2[n];
vdxy_brick[k][j][2*i+1] = work3[n];
n += 2;
}
// z direction gradient
n = 0;
for (k = nzlo_fft; k <= nzhi_fft; k++)
for (j = nylo_fft; j <= nyhi_fft; j++)
for (i = nxlo_fft; i <= nxhi_fft; i++) {
work2[n] = fkz[k]*work1[n+1];
work2[n+1] = -fkz[k]*work1[n];
n += 2;
}
fft2->compute(work2,work2,-1);
n = 0;
for (k = nzlo_in; k <= nzhi_in; k++)
for (j = nylo_in; j <= nyhi_in; j++)
for (i = nxlo_in; i <= nxhi_in; i++) {
vdz0_brick[k][j][2*i] = work2[n];
vdz0_brick[k][j][2*i+1] = 0.;
n += 2;
}
}
/* ----------------------------------------------------------------------
precompute rho coefficients as a lookup table to save time in make_rho
and fieldforce. Instead of doing this polynomial for every atom 6 times
@ -1141,46 +941,6 @@ void PPPMIntel::precompute_rho()
}
}
/* ----------------------------------------------------------------------
pack own values to buf to send to another proc
------------------------------------------------------------------------- */
void PPPMIntel::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
{
int n = 0;
if ((flag == FORWARD_IK) && _use_packing) {
FFT_SCALAR *xsrc = &vdxy_brick[nzlo_out][nylo_out][2*nxlo_out];
FFT_SCALAR *zsrc = &vdz0_brick[nzlo_out][nylo_out][2*nxlo_out];
for (int i = 0; i < nlist; i++) {
buf[n++] = xsrc[list[i]];
buf[n++] = zsrc[list[i]];
}
} else {
PPPM::pack_forward(flag, buf, nlist, list);
}
}
/* ----------------------------------------------------------------------
unpack another proc's own values from buf and set own ghost values
------------------------------------------------------------------------- */
void PPPMIntel::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
{
int n = 0;
if ((flag == FORWARD_IK) && _use_packing) {
FFT_SCALAR *xdest = &vdxy_brick[nzlo_out][nylo_out][2*nxlo_out];
FFT_SCALAR *zdest = &vdz0_brick[nzlo_out][nylo_out][2*nxlo_out];
for (int i = 0; i < nlist; i++) {
xdest[list[i]] = buf[n++];
zdest[list[i]] = buf[n++];
}
} else {
PPPM::unpack_forward(flag, buf, nlist, list);
}
}
/* ----------------------------------------------------------------------
memory usage of local arrays
------------------------------------------------------------------------- */
@ -1201,14 +961,6 @@ double PPPMIntel::memory_usage()
bytes += rho_points * INTEL_P3M_ALIGNED_MAXORDER * sizeof(FFT_SCALAR);
}
}
if (_use_packing) {
bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1)
* (2 * nxhi_out + 1 - 2 * nxlo_out + 1) * sizeof(FFT_SCALAR);
bytes -= 3 * (nxhi_out - nxlo_out + 1) * (nyhi_out - nylo_out + 1)
* (nzhi_out - nzlo_out + 1) * sizeof(FFT_SCALAR);
bytes += 2 * nfft_both * sizeof(FFT_SCALAR);
bytes += cg_pack->memory_usage();
}
return bytes;
}

View File

@ -38,8 +38,6 @@ class PPPMIntel : public PPPM {
virtual ~PPPMIntel();
virtual void init();
virtual void compute(int, int);
virtual void pack_forward(int, FFT_SCALAR *, int, int *);
virtual void unpack_forward(int, FFT_SCALAR *, int, int *);
virtual double memory_usage();
void compute_first(int, int);
void compute_second(int, int);
@ -64,12 +62,6 @@ class PPPMIntel : public PPPM {
FFT_SCALAR **drho_lookup;
FFT_SCALAR half_rho_scale, half_rho_scale_plus;
int _use_packing;
FFT_SCALAR ***vdxy_brick;
FFT_SCALAR ***vdz0_brick;
FFT_SCALAR *work3;
class GridComm *cg_pack;
#ifdef _LMP_INTEL_OFFLOAD
int _use_base;
#endif
@ -92,23 +84,14 @@ class PPPMIntel : public PPPM {
make_rho<flt_t,acc_t,0>(buffers);
}
}
void poisson_ik_intel();
template<class flt_t, class acc_t, int use_table, int use_packing>
template<class flt_t, class acc_t, int use_table>
void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers);
template<class flt_t, class acc_t>
void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) {
if (_use_table == 1) {
if (_use_packing == 1) {
fieldforce_ik<flt_t, acc_t, 1, 1>(buffers);
} else {
fieldforce_ik<flt_t, acc_t, 1, 0>(buffers);
}
fieldforce_ik<flt_t, acc_t, 1>(buffers);
} else {
if (_use_packing == 1) {
fieldforce_ik<flt_t, acc_t, 0, 1>(buffers);
} else {
fieldforce_ik<flt_t, acc_t, 0, 0>(buffers);
}
fieldforce_ik<flt_t, acc_t, 0>(buffers);
}
}
template<class flt_t, class acc_t, int use_table>