@@ -24,6 +24,7 @@
 #include "pppm_gpu.h"
 #include "atom.h"
 #include "comm.h"
+#include "commgrid.h"
 #include "neighbor.h"
 #include "force.h"
 #include "pair.h"
@@ -48,6 +49,9 @@ using namespace MathConst;
 #define LARGE 10000.0
 #define EPS_HOC 1.0e-7

+enum{REVERSE_RHO};
+enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM};
+
 #ifdef FFT_SINGLE
 #define ZEROF 0.0f
 #define ONEF 1.0f
@@ -56,7 +60,7 @@ using namespace MathConst;
 #define ONEF 1.0
 #endif

-// External functions from cuda library for atom decomposition
+// external functions from cuda library for atom decomposition

 #ifdef FFT_SINGLE
 #define PPPM_GPU_API(api) pppm_gpu_ ## api ## _f
@@ -111,10 +115,35 @@ PPPMGPU::~PPPMGPU()

 void PPPMGPU::init()
 {
+  // PPPM init manages all arrays except density_brick_gpu and vd_brick
+  //      thru its deallocate(), allocate()
+  // NOTE: could free density_brick and vdxyz_brick after PPPM allocates them,
+  //       before allocating db_gpu and vd_brick down below, if don't need,
+  //       if do this, make sure to set them to NULL
+  // NOTE: delete/realloc of cg necessary b/c packing 4 values per grid pt,
+  //       not 3 as PPPM does - probably a better way to account for this
+  //       in PPPM::init()
+
+  destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out);
+  destroy_3d_offset(vd_brick,nzlo_out,nylo_out);
+  density_brick_gpu = vd_brick = NULL;
+
   PPPM::init();

+  if (differentiation_flag == 0) {
+    delete cg;
+    int (*procneigh)[2] = comm->procneigh;
+    cg = new CommGrid(lmp,world,4,1,
+                      nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                      nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                      procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                      procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+  }
+
+  // unsupported option
+
   if (differentiation_flag == 1)
-    error->all(FLERR,"Cannot (yet) do analytic differentiation with pppm/gpu.");
+    error->all(FLERR,"Cannot (yet) do analytic differentiation with pppm/gpu");

   if (strcmp(update->integrate_style,"verlet/split") == 0) {
     kspace_split=true;
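A note on the `4,1` arguments passed to CommGrid above: they reflect that vd_brick keeps four consecutive FFT_SCALAR slots per grid point (the patch's own comment, "packing 4 values per grid pt, not 3 as PPPM does"), which is why x indices into it are scaled by 4 elsewhere in this diff. The sketch below only illustrates that addressing; the extent, the component naming, and the use of plain float instead of FFT_SCALAR are assumptions made for the example, not taken from CommGrid or the GPU library.

```cpp
// Layout sketch for a 4-wide interleaved brick such as vd_brick.
// Sizes, component names, and float in place of FFT_SCALAR are assumptions.
#include <cstdio>

int main() {
  const int nx = 4;                  // hypothetical number of grid points in x
  float row[4 * nx];                 // 4 slots per grid point

  for (int ix = 0; ix < nx; ix++) {
    float *cell = &row[4 * ix];      // same x-scaling as the vd_brick indexing
    cell[0] = 1.0f;                  // presumably the x field component
    cell[1] = 2.0f;                  // presumably the y field component
    cell[2] = 3.0f;                  // presumably the z field component
    cell[3] = 0.0f;                  // fourth slot; the comm code never sends it
  }

  printf("second component at ix=2: %g\n", row[4 * 2 + 1]);   // prints 2
  return 0;
}
```

The same `offset = 4*list[i]` arithmetic reappears in the new pack_forward()/unpack_forward() for the FORWARD_IK case later in this patch.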
@@ -142,6 +171,8 @@ void PPPMGPU::init()

   GPU_EXTRA::check_flag(success,error,world);

+  // allocate density_brick_gpu and vd_brick
+
   density_brick_gpu =
     create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
                      nxlo_out,nxhi_out,"pppm:density_brick_gpu",h_brick,1);
@@ -149,7 +180,7 @@ void PPPMGPU::init()
     create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
                      nxlo_out,nxhi_out,"pppm:vd_brick",data,4);

-  poisson_time=0;
+  poisson_time = 0.0;
 }

 /* ----------------------------------------------------------------------
@@ -164,10 +195,8 @@ void PPPMGPU::compute(int eflag, int vflag)
     if (atom->nlocal > old_nlocal) {
       nago=0;
       old_nlocal = atom->nlocal;
-    } else
-      nago=1;
-  } else
-    nago=neighbor->ago;
+    } else nago = 1;
+  } else nago = neighbor->ago;

   // set energy/virial flags
   // invoke allocate_peratom() if needed for first time
@@ -217,6 +246,7 @@ void PPPMGPU::compute(int eflag, int vflag)
   // to fully sum contribution in their 3d bricks
   // remap from 3d decomposition to FFT decomposition

+  cg->reverse_comm(this,REVERSE_RHO);
   brick2fft();

   // compute potential gradient on my FFT grid and
@@ -225,10 +255,20 @@ void PPPMGPU::compute(int eflag, int vflag)

     poisson();

-    // all procs communicate E-field values to fill ghost cells
-    // surrounding their 3d bricks
+    // all procs communicate E-field values
+    // to fill ghost cells surrounding their 3d bricks

-    fillbrick();
+    if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
+    else cg->forward_comm(this,FORWARD_IK);
+
+    // extra per-atom energy/virial communication
+
+    if (evflag_atom) {
+      if (differentiation_flag == 1 && vflag_atom)
+        cg_peratom->forward_comm(this,FORWARD_AD_PERATOM);
+      else if (differentiation_flag == 0)
+        cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
+    }

     poisson_time += MPI_Wtime()-t3;
@@ -237,11 +277,10 @@ void PPPMGPU::compute(int eflag, int vflag)
   FFT_SCALAR qscale = force->qqrd2e * scale;
   PPPM_GPU_API(interp)(qscale);

-  // Compute per-atom energy/virial on host if requested
+  // per-atom energy/virial
+  // energy includes self-energy correction

   if (evflag_atom) {
-    fillbrick_peratom();
     fieldforce_peratom();
-
     double *q = atom->q;
     int nlocal = atom->nlocal;
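With these changes compute() no longer hand-rolls the MPI ghost swaps (the brick2fft()/fillbrick() bodies removed in the hunks below); it asks cg, the CommGrid built in init(), to run the exchange and call back into the pack_*/unpack_* methods added at the end of this patch. The snippet below mimics that callback round trip on a single process so the data flow is visible; the Packer struct, the index lists, and float in place of FFT_SCALAR are stand-ins for this sketch, not the real CommGrid interface.

```cpp
// Schematic of the owner<->ghost callback round trip that cg->reverse_comm()
// drives in this patch; only the pack/unpack bodies mirror the new
// pack_reverse()/unpack_reverse(), everything else is invented for the demo.
#include <cstdio>
#include <vector>

struct Packer {
  std::vector<float> density;                 // stand-in for density_brick_gpu

  void pack_reverse(float *buf, int nlist, const int *list) {
    for (int i = 0; i < nlist; i++) buf[i] = density[list[i]];   // ghost values out
  }
  void unpack_reverse(const float *buf, int nlist, const int *list) {
    for (int i = 0; i < nlist; i++) density[list[i]] += buf[i];  // summed into owned cells
  }
};

int main() {
  Packer p;
  p.density = {1.0f, 2.0f, 3.0f, 4.0f};
  int ghosts[2] = {2, 3};   // indices a neighbor contributed to
  int owned[2]  = {0, 1};   // where those contributions belong locally
  float buf[2];

  p.pack_reverse(buf, 2, ghosts);     // what a REVERSE_RHO exchange would send
  p.unpack_reverse(buf, 2, owned);    // ...and how the receiver accumulates it
  printf("density[0] = %g\n", p.density[0]);  // 1 + 3 = 4
  return 0;
}
```

The forward direction (FORWARD_IK and friends) follows the same pattern but assigns into ghost cells instead of accumulating into owned ones.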
@@ -293,246 +332,13 @@ void PPPMGPU::compute(int eflag, int vflag)
 }

 /* ----------------------------------------------------------------------
-   allocate memory that depends on # of K-vectors and order
-------------------------------------------------------------------------- */
-
-void PPPMGPU::allocate()
-{
-  memory->create(density_fft,nfft_both,"pppm:density_fft");
-  memory->create(greensfn,nfft_both,"pppm:greensfn");
-  memory->create(work1,2*nfft_both,"pppm:work1");
-  memory->create(work2,2*nfft_both,"pppm:work2");
-  memory->create(vg,nfft_both,6,"pppm:vg");
-
-  memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx");
-  memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky");
-  memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz");
-
-  memory->create(buf1,nbuf,"pppm:buf1");
-  memory->create(buf2,nbuf,"pppm:buf2");
-
-  // summation coeffs
-
-  memory->create(gf_b,order,"pppm:gf_b");
-  memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d");
-  memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm:drho1d");
-  memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff");
-  memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2,
-                          "pppm:drho_coeff");
-
-  // create 2 FFTs and a Remap
-  // 1st FFT keeps data in FFT decompostion
-  // 2nd FFT returns data in 3d brick decomposition
-  // remap takes data from 3d brick to FFT decomposition
-
-  int tmp;
-
-  fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   0,0,&tmp);
-
-  fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                   0,0,&tmp);
-
-  remap = new Remap(lmp,world,
-                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                    1,0,0,FFT_PRECISION);
-}
-
-/* ----------------------------------------------------------------------
-   deallocate memory that depends on # of K-vectors and order
-------------------------------------------------------------------------- */
-
-void PPPMGPU::deallocate()
-{
-  destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out);
-  destroy_3d_offset(vd_brick,nzlo_out,nylo_out);
-
-  memory->destroy(density_fft);
-  memory->destroy(greensfn);
-  memory->destroy(work1);
-  memory->destroy(work2);
-  memory->destroy(vg);
-
-  memory->destroy1d_offset(fkx,nxlo_fft);
-  memory->destroy1d_offset(fky,nylo_fft);
-  memory->destroy1d_offset(fkz,nzlo_fft);
-
-  memory->destroy(buf1);
-  memory->destroy(buf2);
-
-  memory->destroy(gf_b);
-  memory->destroy2d_offset(rho1d,-order/2);
-  memory->destroy2d_offset(drho1d,-order/2);
-  memory->destroy2d_offset(rho_coeff,(1-order)/2);
-  memory->destroy2d_offset(drho_coeff,(1-order)/2);
-
-  delete fft1;
-  delete fft2;
-  delete remap;
-}
-
-/* ----------------------------------------------------------------------
-   ghost-swap to accumulate full density in brick decomposition
    remap density from 3d brick decomposition to FFT decomposition
 ------------------------------------------------------------------------- */

 void PPPMGPU::brick2fft()
 {
-  int i,n,ix,iy,iz;
-  MPI_Request request;
-  MPI_Status status;
-
-  // pack my ghosts for +x processor
-  // pass data to self or +x processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxhi_in+1; ix <= nxhi_out; ix++)
-        buf1[n++] = density_brick_gpu[iz][iy][ix];
-
-  if (comm->procneigh[0][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++)
-        density_brick_gpu[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for -x processor
-  // pass data to self or -x processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxlo_out; ix < nxlo_in; ix++)
-        buf1[n++] = density_brick_gpu[iz][iy][ix];
-
-  if (comm->procneigh[0][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++)
-        density_brick_gpu[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for +y processor
-  // pass data to self or +y processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        buf1[n++] = density_brick_gpu[iz][iy][ix];
-
-  if (comm->procneigh[1][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_brick_gpu[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for -y processor
-  // pass data to self or -y processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy < nylo_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        buf1[n++] = density_brick_gpu[iz][iy][ix];
-
-  if (comm->procneigh[1][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_brick_gpu[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for +z processor
-  // pass data to self or +z processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        buf1[n++] = density_brick_gpu[iz][iy][ix];
-
-  if (comm->procneigh[2][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_brick_gpu[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for -z processor
-  // pass data to self or -z processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz < nzlo_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        buf1[n++] = density_brick_gpu[iz][iy][ix];
-
-  if (comm->procneigh[2][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_brick_gpu[iz][iy][ix] += buf2[n++];
-
-  // remap from 3d brick decomposition to FFT decomposition
+  int n,ix,iy,iz;
+
   // copy grabs inner portion of density from 3d brick
   // remap could be done as pre-stage of FFT,
   // but this works optimally on only double values, not complex values
@@ -546,216 +352,6 @@ void PPPMGPU::brick2fft()
   remap->perform(density_fft,density_fft,work1);
 }

-/* ----------------------------------------------------------------------
-   Same as base class - needed to call GPU version of fillbrick_.
-------------------------------------------------------------------------- */
-
-void PPPMGPU::fillbrick()
-{
-  if (differentiation_flag == 1) fillbrick_ad();
-  else fillbrick_ik();
-}
-
-/* ----------------------------------------------------------------------
-   ghost-swap to fill ghost cells of my brick with field values
-------------------------------------------------------------------------- */
-
-void PPPMGPU::fillbrick_ik()
-{
-  int i,n,ix,iy,iz;
-  MPI_Request request;
-  MPI_Status status;
-
-  // pack my real cells for +z processor
-  // pass data to self or +z processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  int x_lo = nxlo_in * 4;
-  int x_hi = nxhi_in * 4 + 1;
-  for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        buf1[n++] = vd_brick[iz][iy][ix];
-        buf1[n++] = vd_brick[iz][iy][ix+1];
-        buf1[n++] = vd_brick[iz][iy][ix+2];
-      }
-
-  if (comm->procneigh[2][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz < nzlo_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        vd_brick[iz][iy][ix] = buf2[n++];
-        vd_brick[iz][iy][ix+1] = buf2[n++];
-        vd_brick[iz][iy][ix+2] = buf2[n++];
-      }
-
-  // pack my real cells for -z processor
-  // pass data to self or -z processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        buf1[n++] = vd_brick[iz][iy][ix];
-        buf1[n++] = vd_brick[iz][iy][ix+1];
-        buf1[n++] = vd_brick[iz][iy][ix+2];
-      }
-
-  if (comm->procneigh[2][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        vd_brick[iz][iy][ix] = buf2[n++];
-        vd_brick[iz][iy][ix+1] = buf2[n++];
-        vd_brick[iz][iy][ix+2] = buf2[n++];
-      }
-
-  // pack my real cells for +y processor
-  // pass data to self or +y processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        buf1[n++] = vd_brick[iz][iy][ix];
-        buf1[n++] = vd_brick[iz][iy][ix+1];
-        buf1[n++] = vd_brick[iz][iy][ix+2];
-      }
-
-  if (comm->procneigh[1][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy < nylo_in; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        vd_brick[iz][iy][ix] = buf2[n++];
-        vd_brick[iz][iy][ix+1] = buf2[n++];
-        vd_brick[iz][iy][ix+2] = buf2[n++];
-      }
-
-  // pack my real cells for -y processor
-  // pass data to self or -y processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        buf1[n++] = vd_brick[iz][iy][ix];
-        buf1[n++] = vd_brick[iz][iy][ix+1];
-        buf1[n++] = vd_brick[iz][iy][ix+2];
-      }
-
-  if (comm->procneigh[1][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        vd_brick[iz][iy][ix] = buf2[n++];
-        vd_brick[iz][iy][ix+1] = buf2[n++];
-        vd_brick[iz][iy][ix+2] = buf2[n++];
-      }
-
-  // pack my real cells for +x processor
-  // pass data to self or +x processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  x_lo = (nxhi_in-nxhi_ghost+1) * 4;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        buf1[n++] = vd_brick[iz][iy][ix];
-        buf1[n++] = vd_brick[iz][iy][ix+1];
-        buf1[n++] = vd_brick[iz][iy][ix+2];
-      }
-
-  if (comm->procneigh[0][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  x_lo = nxlo_out * 4;
-  x_hi = nxlo_in * 4;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        vd_brick[iz][iy][ix] = buf2[n++];
-        vd_brick[iz][iy][ix+1] = buf2[n++];
-        vd_brick[iz][iy][ix+2] = buf2[n++];
-      }
-
-  // pack my real cells for -x processor
-  // pass data to self or -x processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  x_lo = x_hi;
-  x_hi = (nxlo_in+nxlo_ghost) * 4;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        buf1[n++] = vd_brick[iz][iy][ix];
-        buf1[n++] = vd_brick[iz][iy][ix+1];
-        buf1[n++] = vd_brick[iz][iy][ix+2];
-      }
-
-  if (comm->procneigh[0][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  x_lo = (nxhi_in + 1) * 4;
-  x_hi = nxhi_out * 4 + 1;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        vd_brick[iz][iy][ix] = buf2[n++];
-        vd_brick[iz][iy][ix+1] = buf2[n++];
-        vd_brick[iz][iy][ix+2] = buf2[n++];
-      }
-}
-
 /* ----------------------------------------------------------------------
    Same code as base class - necessary to call GPU version of poisson_ik
 ------------------------------------------------------------------------- */
@@ -892,7 +488,149 @@ void PPPMGPU::poisson_ik()
 }

 /* ----------------------------------------------------------------------
-   Create array using offsets from pinned memory allocation
+   pack own values to buf to send to another proc
+------------------------------------------------------------------------- */
+
+void PPPMGPU::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  int n = 0;
+
+  if (flag == FORWARD_IK) {
+    int offset;
+    FFT_SCALAR *src = &vd_brick[nzlo_out][nylo_out][4*nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      offset = 4*list[i];
+      buf[n++] = src[offset++];
+      buf[n++] = src[offset++];
+      buf[n++] = src[offset];
+    }
+  } else if (flag == FORWARD_AD) {
+    FFT_SCALAR *src = &u_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++)
+      buf[i] = src[list[i]];
+  } else if (flag == FORWARD_IK_PERATOM) {
+    FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      if (eflag_atom) buf[n++] = esrc[list[i]];
+      if (vflag_atom) {
+        buf[n++] = v0src[list[i]];
+        buf[n++] = v1src[list[i]];
+        buf[n++] = v2src[list[i]];
+        buf[n++] = v3src[list[i]];
+        buf[n++] = v4src[list[i]];
+        buf[n++] = v5src[list[i]];
+      }
+    }
+  } else if (flag == FORWARD_AD_PERATOM) {
+    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      buf[n++] = v0src[list[i]];
+      buf[n++] = v1src[list[i]];
+      buf[n++] = v2src[list[i]];
+      buf[n++] = v3src[list[i]];
+      buf[n++] = v4src[list[i]];
+      buf[n++] = v5src[list[i]];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack another proc's own values from buf and set own ghost values
+------------------------------------------------------------------------- */
+
+void PPPMGPU::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  int n = 0;
+
+  if (flag == FORWARD_IK) {
+    int offset;
+    FFT_SCALAR *dest = &vd_brick[nzlo_out][nylo_out][4*nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      offset = 4*list[i];
+      dest[offset++] = buf[n++];
+      dest[offset++] = buf[n++];
+      dest[offset] = buf[n++];
+    }
+  } else if (flag == FORWARD_AD) {
+    FFT_SCALAR *dest = &u_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++)
+      dest[list[i]] = buf[n++];
+  } else if (flag == FORWARD_IK_PERATOM) {
+    FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      if (eflag_atom) esrc[list[i]] = buf[n++];
+      if (vflag_atom) {
+        v0src[list[i]] = buf[n++];
+        v1src[list[i]] = buf[n++];
+        v2src[list[i]] = buf[n++];
+        v3src[list[i]] = buf[n++];
+        v4src[list[i]] = buf[n++];
+        v5src[list[i]] = buf[n++];
+      }
+    }
+  } else if (flag == FORWARD_AD_PERATOM) {
+    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      v0src[list[i]] = buf[n++];
+      v1src[list[i]] = buf[n++];
+      v2src[list[i]] = buf[n++];
+      v3src[list[i]] = buf[n++];
+      v4src[list[i]] = buf[n++];
+      v5src[list[i]] = buf[n++];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack ghost values into buf to send to another proc
+------------------------------------------------------------------------- */
+
+void PPPMGPU::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  if (flag == REVERSE_RHO) {
+    FFT_SCALAR *src = &density_brick_gpu[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++)
+      buf[i] = src[list[i]];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack another proc's ghost values from buf and add to own values
+------------------------------------------------------------------------- */
+
+void PPPMGPU::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  if (flag == REVERSE_RHO) {
+    FFT_SCALAR *dest = &density_brick_gpu[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++)
+      dest[list[i]] += buf[i];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   create array using offsets from pinned memory allocation
 ------------------------------------------------------------------------- */

 FFT_SCALAR ***PPPMGPU::create_3d_offset(int n1lo, int n1hi, int n2lo, int n2hi,
@@ -942,19 +680,11 @@ void PPPMGPU::destroy_3d_offset(FFT_SCALAR ***array, int n1_offset,

 double PPPMGPU::memory_usage()
 {
-  double bytes = nmax*3 * sizeof(double);
-  int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
-    (nzhi_out-nzlo_out+1);
-  bytes += 4 * nbrick * sizeof(FFT_SCALAR);
-  bytes += 6 * nfft_both * sizeof(double);
-  bytes += nfft_both * sizeof(double);
-  bytes += nfft_both*5 * sizeof(FFT_SCALAR);
-  bytes += 2 * nbuf * sizeof(double);
-
-  if (peratom_allocate_flag) {
-    bytes += 7 * nbrick * sizeof(FFT_SCALAR);
-    bytes += 2 * nbuf_peratom * sizeof(FFT_SCALAR);
-  }
+  double bytes = PPPM::memory_usage();
+
+  // NOTE: add tallying here for density_brick_gpu and vd_brick
+  //       could subtract out density_brick and vdxyz_brick if freed them above
+  //       if the net effect is zero, do nothing

   return bytes + PPPM_GPU_API(bytes)();
 }
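For the forward (owned-to-ghost) direction, the pack_forward() and unpack_forward() added above are exact mirrors in the FORWARD_IK case: three of the four slots per grid point travel, and the receiver overwrites its ghost slots rather than accumulating into them, in contrast to the reverse path where unpack_reverse() adds into owned cells. A self-contained round trip of that logic, with invented sizes and index lists and double standing in for FFT_SCALAR:

```cpp
// Round-trip sketch of the FORWARD_IK pack/unpack added in this patch.
// Array sizes, the send/receive index lists, and the double type are
// assumptions for the demo; only the loop shapes mirror the real code.
#include <cstdio>

int main() {
  const int npoints = 6;
  double brick[4 * npoints] = {0.0};

  // pretend grid points 0-2 are owned and already hold field values
  for (int p = 0; p < 3; p++)
    for (int c = 0; c < 3; c++) brick[4 * p + c] = 10.0 * p + c;

  int send_list[3] = {0, 1, 2};   // owned points to send
  int recv_list[3] = {3, 4, 5};   // ghost points that receive them
  double buf[9];

  int n = 0;                      // pack: same loop shape as pack_forward()
  for (int i = 0; i < 3; i++) {
    int offset = 4 * send_list[i];
    buf[n++] = brick[offset++];
    buf[n++] = brick[offset++];
    buf[n++] = brick[offset];
  }

  n = 0;                          // unpack: same loop shape as unpack_forward()
  for (int i = 0; i < 3; i++) {
    int offset = 4 * recv_list[i];
    brick[offset++] = buf[n++];
    brick[offset++] = buf[n++];
    brick[offset] = buf[n++];
  }

  printf("ghost point 4, component 2: %g\n", brick[4 * 4 + 2]);  // prints 12
  return 0;
}
```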