diff --git a/src/KSPACE/ewald.cpp b/src/KSPACE/ewald.cpp
index 7989e0e28d..066582188d 100644
--- a/src/KSPACE/ewald.cpp
+++ b/src/KSPACE/ewald.cpp
@@ -132,8 +132,8 @@ void Ewald::init()
   gsqmx = -4.0*g_ewald*g_ewald*log(precision);
 
   if (comm->me == 0) {
-    if (screen) fprintf(screen,"  G vector = %g\n",g_ewald);
-    if (logfile) fprintf(logfile,"  G vector = %g\n",g_ewald);
+    if (screen) fprintf(screen,"  G vector (1/distance) = %g\n",g_ewald);
+    if (logfile) fprintf(logfile,"  G vector (1/distnace) = %g\n",g_ewald);
   }
 
   // setup Ewald coefficients so can print stats
@@ -212,8 +212,16 @@ void Ewald::compute(int eflag, int vflag)
 {
   int i,k,n;
 
-  energy = 0.0;
-  if (vflag) for (n = 0; n < 6; n++) virial[n] = 0.0;
+  // set energy/virial flags
+  // invoke allocate_peratom() if needed for first time
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = eflag_global = vflag_global = eflag_atom = vflag_atom = 0;
+
+  if (!peratom_allocate_flag && (eflag_atom || vflag_atom)) {
+    allocate_peratom();
+    peratom_allocate_flag = 1;
+  }
 
   // extend size of per-atom arrays if necessary
 
@@ -278,9 +286,9 @@ void Ewald::compute(int eflag, int vflag)
     f[i][2] += qscale * q[i]*ek[i][2];
   }
  
-  // energy if requested
+  // global energy if requested
 
-  if (eflag) {
+  if (eflag_global) {
     for (k = 0; k < kcount; k++)
       energy += ug[k] * (sfacrl_all[k]*sfacrl_all[k] + 
 			 sfacim_all[k]*sfacim_all[k]);
@@ -289,9 +297,9 @@ void Ewald::compute(int eflag, int vflag)
     energy *= qscale;
   }
 
-  // virial if requested
+  // global virial if requested
 
-  if (vflag) {
+  if (vflag_global) {
     double uk;
     for (k = 0; k < kcount; k++) {
       uk = ug[k] * (sfacrl_all[k]*sfacrl_all[k] + sfacim_all[k]*sfacim_all[k]);
@@ -300,7 +308,9 @@ void Ewald::compute(int eflag, int vflag)
     for (n = 0; n < 6; n++) virial[n] *= qscale;
   }
 
-  if (slabflag) slabcorr(eflag);
+  // 2d slab correction
+
+  if (slabflag) slabcorr();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -801,7 +811,7 @@ void Ewald::deallocate()
    111, 3155).  Slabs defined here to be parallel to the xy plane. 
 ------------------------------------------------------------------------- */
 
-void Ewald::slabcorr(int eflag)
+void Ewald::slabcorr()
 {
   // compute local contribution to global dipole moment
   
@@ -822,7 +832,7 @@ void Ewald::slabcorr(int eflag)
   const double e_slabcorr = 2.0*MY_PI*dipole_all*dipole_all/volume;
   const double qscale = force->qqrd2e * scale;
   
-  if (eflag) energy += qscale * e_slabcorr;
+  if (eflag_global) energy += qscale * e_slabcorr;
 
   // add on force corrections
 
diff --git a/src/KSPACE/ewald.h b/src/KSPACE/ewald.h
index 243dc71666..51075b265a 100644
--- a/src/KSPACE/ewald.h
+++ b/src/KSPACE/ewald.h
@@ -51,7 +51,7 @@ class Ewald : public KSpace {
   void coeffs();
   virtual void allocate();
   void deallocate();
-  void slabcorr(int);
+  void slabcorr();
 };
 
 }
diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp
index a3331daf72..21f9916c12 100644
--- a/src/KSPACE/pppm.cpp
+++ b/src/KSPACE/pppm.cpp
@@ -72,11 +72,13 @@ PPPM::PPPM(LAMMPS *lmp, int narg, char **arg) : KSpace(lmp, narg, arg)
 
   density_brick = vdx_brick = vdy_brick = vdz_brick = NULL;
   density_fft = NULL;
+  u_brick = NULL;
+  v0_brick = v1_brick = v2_brick = v3_brick = v4_brick = v5_brick = NULL;
   greensfn = NULL;
   work1 = work2 = NULL;
   vg = NULL;
   fkx = fky = fkz = NULL;
-  buf1 = buf2 = NULL;
+  buf1 = buf2 = buf3 = buf4 = NULL;
 
   gf_b = NULL;
   rho1d = rho_coeff = NULL;
@@ -96,6 +98,7 @@ PPPM::~PPPM()
 {
   delete [] factors;
   deallocate();
+  deallocate_peratom();
   memory->destroy(part2grid);
 }
 
@@ -136,6 +139,8 @@ void PPPM::init()
   // free all arrays previously allocated
 
   deallocate();
+  deallocate_peratom();
+  peratom_allocate_flag = 0;
 
   // extract short-range Coulombic cutoff from pair style
 
@@ -479,6 +484,8 @@ void PPPM::init()
 
   nbuf = MAX(nxx,nyy);
   nbuf = MAX(nbuf,nzz);
+
+  nbuf_peratom = 7*nbuf;
   nbuf *= 3;
 
   // print stats
@@ -496,6 +503,7 @@ void PPPM::init()
   }
 
   // allocate K-space dependent memory
+  // don't invoke allocate_peratom() here, wait to see if needed
 
   allocate();
 
@@ -665,7 +673,18 @@ void PPPM::setup()
 
 void PPPM::compute(int eflag, int vflag)
 {
-  int i;
+  int i,j;
+
+  // set energy/virial flags
+  // invoke allocate_peratom() if needed for first time
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = eflag_global = vflag_global = eflag_atom = vflag_atom = 0;
+
+  if (!peratom_allocate_flag && (eflag_atom || vflag_atom)) {
+    allocate_peratom();
+    peratom_allocate_flag = 1;
+  }
 
   // convert atoms from box to lamda coords
   
@@ -683,9 +702,6 @@ void PPPM::compute(int eflag, int vflag)
     memory->create(part2grid,nmax,3,"pppm:part2grid");
   }
 
-  energy = 0.0;
-  if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0;
-
   // find grid points for all my particles
   // map my particle charge onto my local 3d density grid
 
@@ -701,23 +717,32 @@ void PPPM::compute(int eflag, int vflag)
   // compute potential gradient on my FFT grid and
   //   portion of e_long on this proc's FFT grid
   // return gradients (electric fields) in 3d brick decomposition
-  
-  poisson(eflag,vflag);
+  // also performs per-atom calculations via poisson_peratom()
 
-  // all procs communicate E-field values to fill ghost cells
-  //   surrounding their 3d bricks
+  poisson();
+
+  // all procs communicate E-field values
+  // to fill ghost cells surrounding their 3d bricks
 
   fillbrick();
 
+  // extra per-atom energy/virial communication
+
+  if (eflag_atom || vflag_atom) fillbrick_peratom();
+
   // calculate the force on my particles
 
   fieldforce();
 
-  // sum energy across procs and add in volume-dependent term
+  // extra per-atom energy/virial communication
+
+  if (eflag_atom || vflag_atom) fieldforce_peratom();
+
+  // sum global energy across procs and add in volume-dependent term
 
   const double qscale = force->qqrd2e * scale;
 
-  if (eflag) {
+  if (eflag_global) {
     double energy_all;
     MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
     energy = energy_all;
@@ -728,17 +753,39 @@ void PPPM::compute(int eflag, int vflag)
     energy *= qscale;
   }
 
-  // sum virial across procs
+  // sum global virial across procs
 
-  if (vflag) {
+  if (vflag_global) {
     double virial_all[6];
     MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
     for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
   }
 
+  // per-atom energy/virial
+  // energy includes self-energy correction
+
+  if (eflag_atom || vflag_atom) {
+    double *q = atom->q;
+    int nlocal = atom->nlocal;
+
+    if (eflag_atom) {
+      for (i = 0; i < nlocal; i++) {
+	eatom[i] *= 0.5;
+        eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / 
+	  (g_ewald*g_ewald*volume);
+        eatom[i] *= qscale;
+      }
+    }
+
+    if (vflag_atom) {
+      for (i = 0; i < nlocal; i++)
+        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*q[i]*qscale;
+    }
+  }
+
   // 2d slab correction
 
-  if (slabflag) slabcorr(eflag);
+  if (slabflag) slabcorr(eflag_global);
 
   // convert atoms back from lamda to box coords
   
@@ -802,6 +849,32 @@ void PPPM::allocate()
 		    1,0,0,FFT_PRECISION);
 }
 
+/* ----------------------------------------------------------------------
+   allocate per-atom memory that depends on # of K-vectors and order 
+------------------------------------------------------------------------- */
+
+void PPPM::allocate_peratom()
+{  
+  memory->create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:u_brick");
+  
+  memory->create3d_offset(v0_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:v0_brick");
+  memory->create3d_offset(v1_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:v1_brick");
+  memory->create3d_offset(v2_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:v2_brick");
+  memory->create3d_offset(v3_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:v3_brick");
+  memory->create3d_offset(v4_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:v4_brick");
+  memory->create3d_offset(v5_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+			  nxlo_out,nxhi_out,"pppm:v5_brick");
+
+  memory->create(buf3,nbuf_peratom,"pppm:buf3");
+  memory->create(buf4,nbuf_peratom,"pppm:buf4");
+}
+
 /* ----------------------------------------------------------------------
    deallocate memory that depends on # of K-vectors and order 
 ------------------------------------------------------------------------- */
@@ -835,6 +908,25 @@ void PPPM::deallocate()
   delete remap;
 }
 
+/* ----------------------------------------------------------------------
+   deallocate per-atom memory that depends on # of K-vectors and order 
+------------------------------------------------------------------------- */
+
+void PPPM::deallocate_peratom()
+{
+  memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out);
+    
+  memory->destroy3d_offset(v0_brick,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v1_brick,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v2_brick,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v3_brick,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v4_brick,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v5_brick,nzlo_out,nylo_out,nxlo_out);
+
+  memory->destroy(buf3);
+  memory->destroy(buf4); 
+}
+
 /* ----------------------------------------------------------------------
    set size of FFT grid (nx,ny,nz_pppm) and g_ewald 
 ------------------------------------------------------------------------- */
@@ -995,14 +1087,14 @@ void PPPM::set_grid()
     const char fft_prec[] = "double";
 #endif
     if (screen) {
-      fprintf(screen,"  G vector = %g\n",g_ewald);
+      fprintf(screen,"  G vector (1/distance)= %g\n",g_ewald);
       fprintf(screen,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
       fprintf(screen,"  stencil order = %d\n",order);
       fprintf(screen,"  RMS precision = %g\n",MAX(lpr,spr));
       fprintf(screen,"  using %s precision FFTs\n",fft_prec);
     }
     if (logfile) {
-      fprintf(logfile,"  G vector = %g\n",g_ewald);
+      fprintf(logfile,"  G vector (1/distance) = %g\n",g_ewald);
       fprintf(logfile,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
       fprintf(logfile,"  stencil order = %d\n",order);
       fprintf(logfile,"  RMS precision = %g\n",MAX(lpr,spr));
@@ -1454,6 +1546,275 @@ void PPPM::fillbrick()
       }
 }
 
+/* ----------------------------------------------------------------------
+   ghost-swap to fill ghost cells of my brick with per-atom field values
+------------------------------------------------------------------------- */
+
+void PPPM::fillbrick_peratom()
+{
+  int i,n,ix,iy,iz;
+  MPI_Request request;
+  MPI_Status status;
+
+  // pack my real cells for +z processor
+  // pass data to self or +z processor
+  // unpack and sum recv data into my ghost cells
+
+  n = 0;
+  for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
+    for (iy = nylo_in; iy <= nyhi_in; iy++)
+      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
+	if (eflag_atom) buf3[n++] = u_brick[iz][iy][ix];
+	if (vflag_atom) {
+	  buf3[n++] = v0_brick[iz][iy][ix];
+	  buf3[n++] = v1_brick[iz][iy][ix];
+	  buf3[n++] = v2_brick[iz][iy][ix];
+	  buf3[n++] = v3_brick[iz][iy][ix];
+	  buf3[n++] = v4_brick[iz][iy][ix];
+	  buf3[n++] = v5_brick[iz][iy][ix];
+	}
+      }
+
+  if (comm->procneigh[2][1] == me)
+    for (i = 0; i < n; i++) buf4[i] = buf3[i];
+  else {
+    MPI_Irecv(buf4,nbuf_peratom,MPI_FFT_SCALAR,
+	      comm->procneigh[2][0],0,world,&request);
+    MPI_Send(buf3,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
+    MPI_Wait(&request,&status);
+  }
+
+  n = 0;
+  for (iz = nzlo_out; iz < nzlo_in; iz++)
+    for (iy = nylo_in; iy <= nyhi_in; iy++)
+      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
+	if (eflag_atom) u_brick[iz][iy][ix] = buf4[n++];
+	if (vflag_atom) {
+	  v0_brick[iz][iy][ix] = buf4[n++];
+	  v1_brick[iz][iy][ix] = buf4[n++];
+	  v2_brick[iz][iy][ix] = buf4[n++];
+	  v3_brick[iz][iy][ix] = buf4[n++];
+	  v4_brick[iz][iy][ix] = buf4[n++];
+	  v5_brick[iz][iy][ix] = buf4[n++];
+	}
+      }
+
+  // pack my real cells for -z processor
+  // pass data to self or -z processor
+  // unpack and sum recv data into my ghost cells
+
+  n = 0;
+  for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
+    for (iy = nylo_in; iy <= nyhi_in; iy++)
+      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
+	if (eflag_atom) buf3[n++] = u_brick[iz][iy][ix];
+	if (vflag_atom) {
+	  buf3[n++] = v0_brick[iz][iy][ix];
+	  buf3[n++] = v1_brick[iz][iy][ix];
+	  buf3[n++] = v2_brick[iz][iy][ix];
+	  buf3[n++] = v3_brick[iz][iy][ix];
+	  buf3[n++] = v4_brick[iz][iy][ix];
+	  buf3[n++] = v5_brick[iz][iy][ix];
+	}
+      }
+
+  if (comm->procneigh[2][0] == me)
+    for (i = 0; i < n; i++) buf4[i] = buf3[i];
+  else {
+    MPI_Irecv(buf4,nbuf_peratom,MPI_FFT_SCALAR,
+	      comm->procneigh[2][1],0,world,&request);
+    MPI_Send(buf3,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
+    MPI_Wait(&request,&status);
+  }
+
+  n = 0;
+  for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
+    for (iy = nylo_in; iy <= nyhi_in; iy++)
+      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
+	if (eflag_atom) u_brick[iz][iy][ix] = buf4[n++];
+	if (vflag_atom) {
+	  v0_brick[iz][iy][ix] = buf4[n++];
+	  v1_brick[iz][iy][ix] = buf4[n++];
+	  v2_brick[iz][iy][ix] = buf4[n++];
+	  v3_brick[iz][iy][ix] = buf4[n++];
+	  v4_brick[iz][iy][ix] = buf4[n++];
+	  v5_brick[iz][iy][ix] = buf4[n++];
+	}
+      }
+
+  // pack my real cells for +y processor
+  // pass data to self or +y processor
+  // unpack and sum recv data into my ghost cells
+
+  n = 0;
+  for (iz = nzlo_out; iz <= nzhi_out; iz++)
+    for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
+      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
+	if (eflag_atom) buf3[n++] = u_brick[iz][iy][ix];
+	if (vflag_atom) {
+	  buf3[n++] = v0_brick[iz][iy][ix];
+	  buf3[n++] = v1_brick[iz][iy][ix];
+	  buf3[n++] = v2_brick[iz][iy][ix];
+	  buf3[n++] = v3_brick[iz][iy][ix];
+	  buf3[n++] = v4_brick[iz][iy][ix];
+	  buf3[n++] = v5_brick[iz][iy][ix];
+	}
+      }
+
+  if (comm->procneigh[1][1] == me)
+    for (i = 0; i < n; i++) buf4[i] = buf3[i];
+  else {
+    MPI_Irecv(buf4,nbuf_peratom,MPI_FFT_SCALAR,
+	      comm->procneigh[1][0],0,world,&request);
+    MPI_Send(buf3,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
+    MPI_Wait(&request,&status);
+  }
+
+  n = 0;
+  for (iz = nzlo_out; iz <= nzhi_out; iz++)
+    for (iy = nylo_out; iy < nylo_in; iy++)
+      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
+	if (eflag_atom) u_brick[iz][iy][ix] = buf4[n++];
+	if (vflag_atom) {
+	  v0_brick[iz][iy][ix] = buf4[n++];
+	  v1_brick[iz][iy][ix] = buf4[n++];
+	  v2_brick[iz][iy][ix] = buf4[n++];
+	  v3_brick[iz][iy][ix] = buf4[n++];
+	  v4_brick[iz][iy][ix] = buf4[n++];
+	  v5_brick[iz][iy][ix] = buf4[n++];
+	}
+      }
+
+  // pack my real cells for -y processor
+  // pass data to self or -y processor
+  // unpack and sum recv data into my ghost cells
+
+  n = 0;
+  for (iz = nzlo_out; iz <= nzhi_out; iz++)
+    for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
+      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
+	if (eflag_atom) buf3[n++] = u_brick[iz][iy][ix];
+	if (vflag_atom) {
+	  buf3[n++] = v0_brick[iz][iy][ix];
+	  buf3[n++] = v1_brick[iz][iy][ix];
+	  buf3[n++] = v2_brick[iz][iy][ix];
+	  buf3[n++] = v3_brick[iz][iy][ix];
+	  buf3[n++] = v4_brick[iz][iy][ix];
+	  buf3[n++] = v5_brick[iz][iy][ix];
+	}
+      }
+
+  if (comm->procneigh[1][0] == me)
+    for (i = 0; i < n; i++) buf4[i] = buf3[i];
+  else {
+    MPI_Irecv(buf4,nbuf_peratom,MPI_FFT_SCALAR,
+	      comm->procneigh[1][1],0,world,&request);
+    MPI_Send(buf3,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
+    MPI_Wait(&request,&status);
+  }
+
+  n = 0;
+  for (iz = nzlo_out; iz <= nzhi_out; iz++)
+    for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
+      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
+	if (eflag_atom) u_brick[iz][iy][ix] = buf4[n++];
+	if (vflag_atom) {
+	  v0_brick[iz][iy][ix] = buf4[n++];
+	  v1_brick[iz][iy][ix] = buf4[n++];
+	  v2_brick[iz][iy][ix] = buf4[n++];
+	  v3_brick[iz][iy][ix] = buf4[n++];
+	  v4_brick[iz][iy][ix] = buf4[n++];
+	  v5_brick[iz][iy][ix] = buf4[n++];
+	}
+      }
+
+  // pack my real cells for +x processor
+  // pass data to self or +x processor
+  // unpack and sum recv data into my ghost cells
+
+  n = 0;
+  for (iz = nzlo_out; iz <= nzhi_out; iz++)
+    for (iy = nylo_out; iy <= nyhi_out; iy++)
+      for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++) {
+	if (eflag_atom) buf3[n++] = u_brick[iz][iy][ix];
+	if (vflag_atom) {
+	  buf3[n++] = v0_brick[iz][iy][ix];
+	  buf3[n++] = v1_brick[iz][iy][ix];
+	  buf3[n++] = v2_brick[iz][iy][ix];
+	  buf3[n++] = v3_brick[iz][iy][ix];
+	  buf3[n++] = v4_brick[iz][iy][ix];
+	  buf3[n++] = v5_brick[iz][iy][ix];
+	}
+      }
+
+  if (comm->procneigh[0][1] == me)
+    for (i = 0; i < n; i++) buf4[i] = buf3[i];
+  else {
+    MPI_Irecv(buf4,nbuf_peratom,MPI_FFT_SCALAR,
+	      comm->procneigh[0][0],0,world,&request);
+    MPI_Send(buf3,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
+    MPI_Wait(&request,&status);
+  }
+
+  n = 0;
+  for (iz = nzlo_out; iz <= nzhi_out; iz++)
+    for (iy = nylo_out; iy <= nyhi_out; iy++)
+      for (ix = nxlo_out; ix < nxlo_in; ix++) {
+	if (eflag_atom) u_brick[iz][iy][ix] = buf4[n++];
+	if (vflag_atom) {
+	  v0_brick[iz][iy][ix] = buf4[n++];
+	  v1_brick[iz][iy][ix] = buf4[n++];
+	  v2_brick[iz][iy][ix] = buf4[n++];
+	  v3_brick[iz][iy][ix] = buf4[n++];
+	  v4_brick[iz][iy][ix] = buf4[n++];
+	  v5_brick[iz][iy][ix] = buf4[n++];
+	}
+      }
+
+  // pack my real cells for -x processor
+  // pass data to self or -x processor
+  // unpack and sum recv data into my ghost cells
+
+  n = 0;
+  for (iz = nzlo_out; iz <= nzhi_out; iz++)
+    for (iy = nylo_out; iy <= nyhi_out; iy++)
+      for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++) {
+	if (eflag_atom) buf3[n++] = u_brick[iz][iy][ix];
+	if (vflag_atom) {
+	  buf3[n++] = v0_brick[iz][iy][ix];
+	  buf3[n++] = v1_brick[iz][iy][ix];
+	  buf3[n++] = v2_brick[iz][iy][ix];
+	  buf3[n++] = v3_brick[iz][iy][ix];
+	  buf3[n++] = v4_brick[iz][iy][ix];
+	  buf3[n++] = v5_brick[iz][iy][ix];
+	}
+      }
+
+  if (comm->procneigh[0][0] == me)
+    for (i = 0; i < n; i++) buf4[i] = buf3[i];
+  else {
+    MPI_Irecv(buf4,nbuf_peratom,MPI_FFT_SCALAR,
+	      comm->procneigh[0][1],0,world,&request);
+    MPI_Send(buf3,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
+    MPI_Wait(&request,&status);
+  }
+
+  n = 0;
+  for (iz = nzlo_out; iz <= nzhi_out; iz++)
+    for (iy = nylo_out; iy <= nyhi_out; iy++)
+      for (ix = nxhi_in+1; ix <= nxhi_out; ix++) {
+	if (eflag_atom) u_brick[iz][iy][ix] = buf4[n++];
+	if (vflag_atom) {
+	  v0_brick[iz][iy][ix] = buf4[n++];
+	  v1_brick[iz][iy][ix] = buf4[n++];
+	  v2_brick[iz][iy][ix] = buf4[n++];
+	  v3_brick[iz][iy][ix] = buf4[n++];
+	  v4_brick[iz][iy][ix] = buf4[n++];
+	  v5_brick[iz][iy][ix] = buf4[n++];
+	}
+      }
+}
+
 /* ----------------------------------------------------------------------
    find center grid pt for each of my particles
    check that full stencil for the particle will fit in my 3d brick
@@ -1550,7 +1911,7 @@ void PPPM::make_rho()
    FFT-based Poisson solver 
 ------------------------------------------------------------------------- */
 
-void PPPM::poisson(int eflag, int vflag)
+void PPPM::poisson()
 {
   int i,j,k,n;
   double eng;
@@ -1565,18 +1926,18 @@ void PPPM::poisson(int eflag, int vflag)
  
   fft1->compute(work1,work1,1);
 
-  // if requested, compute energy and virial contribution
+  // global energy and virial contribution
 
   double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
   double s2 = scaleinv*scaleinv;
 
-  if (eflag || vflag) {
-    if (vflag) {
+  if (eflag_global || vflag_global) {
+    if (vflag_global) {
       n = 0;
       for (i = 0; i < nfft; i++) {
 	eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]);
 	for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
-	energy += eng;
+	if (eflag_global) energy += eng;
 	n += 2;
       }
     } else {
@@ -1598,6 +1959,10 @@ void PPPM::poisson(int eflag, int vflag)
     work1[n++] *= scaleinv * greensfn[i];
   }
 
+  // extra FFTs for per-atom energy/virial
+
+  if (eflag_atom || vflag_atom) poisson_peratom();
+
   // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k)
   // FFT leaves data in 3d brick decomposition
   // copy it into inner portion of vdx,vdy,vdz arrays
@@ -1666,6 +2031,142 @@ void PPPM::poisson(int eflag, int vflag)
       }
 }
 
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver for per-atom energy/virial
+------------------------------------------------------------------------- */
+
+void PPPM::poisson_peratom()
+{
+  int i,j,k,n;
+
+  // energy
+    
+  if (eflag_atom) {
+    n = 0;
+    for (i = 0; i < nfft; i++) {
+      work2[n] = work1[n];
+      work2[n+1] = work1[n+1];
+      n += 2;
+    }
+    
+    fft2->compute(work2,work2,-1); 
+    
+    n = 0;
+    for (k = nzlo_in; k <= nzhi_in; k++)
+      for (j = nylo_in; j <= nyhi_in; j++)
+        for (i = nxlo_in; i <= nxhi_in; i++) {
+	  u_brick[k][j][i] = work2[n];
+	  n += 2;
+        }
+  }
+   
+  // 6 components of virial in v0 thru v5
+
+  if (!vflag_atom) return;
+	 
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work2[n] = work1[n]*vg[i][0];
+    work2[n+1] = work1[n+1]*vg[i][0];
+    n += 2;
+  }
+    
+  fft2->compute(work2,work2,-1); 
+    
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+	v0_brick[k][j][i] = work2[n];
+	n += 2;
+      }
+  
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work2[n] = work1[n]*vg[i][1];
+    work2[n+1] = work1[n+1]*vg[i][1];
+    n += 2;
+  }
+  
+  fft2->compute(work2,work2,-1); 
+  
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+	v1_brick[k][j][i] = work2[n];
+	n += 2;
+      }
+  
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work2[n] = work1[n]*vg[i][2];
+    work2[n+1] = work1[n+1]*vg[i][2];
+    n += 2;
+  }
+    
+  fft2->compute(work2,work2,-1); 
+  
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+	v2_brick[k][j][i] = work2[n];
+	n += 2;
+      }
+  
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work2[n] = work1[n]*vg[i][3];
+    work2[n+1] = work1[n+1]*vg[i][3];
+    n += 2;
+  }
+  
+  fft2->compute(work2,work2,-1); 
+  
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+	v3_brick[k][j][i] = work2[n];
+	n += 2;
+      }
+   
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work2[n] = work1[n]*vg[i][4];
+    work2[n+1] = work1[n+1]*vg[i][4];
+    n += 2;
+  }
+  
+  fft2->compute(work2,work2,-1); 
+  
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+	v4_brick[k][j][i] = work2[n];
+	n += 2;
+      }
+  
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work2[n] = work1[n]*vg[i][5];
+    work2[n+1] = work1[n+1]*vg[i][5];
+    n += 2;
+  }
+    
+  fft2->compute(work2,work2,-1); 
+    
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+	v5_brick[k][j][i] = work2[n];
+	n += 2;
+      }
+}
+
 /* ----------------------------------------------------------------------
    interpolate from grid to get electric field & force on my particles 
 ------------------------------------------------------------------------- */
@@ -1724,6 +2225,72 @@ void PPPM::fieldforce()
   }
 }
 
+/* ----------------------------------------------------------------------
+   interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */
+
+void PPPM::fieldforce_peratom()
+{
+  int i,l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz,x0,y0,z0;
+  FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
+
+  // loop over my charges, interpolate from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  double *q = atom->q;
+  double **x = atom->x;
+  double **f = atom->f;
+
+  int nlocal = atom->nlocal;
+
+  for (i = 0; i < nlocal; i++) {
+    nx = part2grid[i][0];
+    ny = part2grid[i][1];
+    nz = part2grid[i][2];
+    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(dx,dy,dz);
+
+    u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      z0 = rho1d[2][n];
+      for (m = nlower; m <= nupper; m++) {
+	my = m+ny;
+	y0 = z0*rho1d[1][m];
+	for (l = nlower; l <= nupper; l++) {
+	  mx = l+nx;
+	  x0 = y0*rho1d[0][l];
+	  if (eflag_atom) u += x0*u_brick[mz][my][mx];	
+	  if (vflag_atom) {
+	    v0 += x0*v0_brick[mz][my][mx];
+	    v1 += x0*v1_brick[mz][my][mx];
+	    v2 += x0*v2_brick[mz][my][mx];
+	    v3 += x0*v3_brick[mz][my][mx];
+	    v4 += x0*v4_brick[mz][my][mx];
+	    v5 += x0*v5_brick[mz][my][mx];
+	  }
+	}
+      }
+    }
+
+    if (eflag_atom) eatom[i] += q[i]*u;
+    if (vflag_atom) {
+      vatom[i][0] += v0;
+      vatom[i][1] += v1;
+      vatom[i][2] += v2;
+      vatom[i][3] += v3;
+      vatom[i][4] += v4;
+      vatom[i][5] += v5;
+    }
+  }
+}
+
 /* ----------------------------------------------------------------------
    map nprocs to NX by NY grid as PX by PY procs - return optimal px,py 
 ------------------------------------------------------------------------- */
@@ -1853,7 +2420,7 @@ void PPPM::compute_rho_coeff()
    111, 3155).  Slabs defined here to be parallel to the xy plane. 
 ------------------------------------------------------------------------- */
 
-void PPPM::slabcorr(int eflag)
+void PPPM::slabcorr()
 {
   // compute local contribution to global dipole moment
 
@@ -1874,7 +2441,7 @@ void PPPM::slabcorr(int eflag)
   const double e_slabcorr = 2.0*MY_PI*dipole_all*dipole_all/volume;
   const double qscale = force->qqrd2e * scale;
   
-  if (eflag) energy += qscale * e_slabcorr;
+  if (eflag_global) energy += qscale * e_slabcorr;
 
   // add on force corrections
 
@@ -1937,5 +2504,11 @@ double PPPM::memory_usage()
   bytes += nfft_both * sizeof(double);
   bytes += nfft_both*5 * sizeof(FFT_SCALAR);
   bytes += 2 * nbuf * sizeof(FFT_SCALAR);
+
+  if (peratom_allocate_flag) {
+    bytes += 7 * nbrick * sizeof(FFT_SCALAR);
+    bytes += 2 * nbuf_peratom * sizeof(FFT_SCALAR);
+  }
+
   return bytes;
 }
diff --git a/src/KSPACE/pppm.h b/src/KSPACE/pppm.h
index 8845dda246..dd327fb170 100644
--- a/src/KSPACE/pppm.h
+++ b/src/KSPACE/pppm.h
@@ -55,22 +55,27 @@ class PPPM : public KSpace {
   double volume;
   double delxinv,delyinv,delzinv,delvolinv;
   double shift,shiftone;
+  int peratom_allocate_flag;
 
   int nxlo_in,nylo_in,nzlo_in,nxhi_in,nyhi_in,nzhi_in;
   int nxlo_out,nylo_out,nzlo_out,nxhi_out,nyhi_out,nzhi_out;
   int nxlo_ghost,nxhi_ghost,nylo_ghost,nyhi_ghost,nzlo_ghost,nzhi_ghost;
   int nxlo_fft,nylo_fft,nzlo_fft,nxhi_fft,nyhi_fft,nzhi_fft;
   int nlower,nupper;
-  int ngrid,nfft,nbuf,nfft_both;
+  int ngrid,nfft,nfft_both;
+  int nbuf,nbuf_peratom;
 
   FFT_SCALAR ***density_brick;
   FFT_SCALAR ***vdx_brick,***vdy_brick,***vdz_brick;
+  FFT_SCALAR ***u_brick;
+  FFT_SCALAR ***v0_brick,***v1_brick,***v2_brick;
+  FFT_SCALAR ***v3_brick,***v4_brick,***v5_brick;
   double *greensfn;
   double **vg;
   double *fkx,*fky,*fkz;
   FFT_SCALAR *density_fft;
   FFT_SCALAR *work1,*work2;
-  FFT_SCALAR *buf1,*buf2;
+  FFT_SCALAR *buf1,*buf2,*buf3,*buf4;
 
   double *gf_b;
   FFT_SCALAR **rho1d,**rho_coeff;
@@ -90,7 +95,9 @@ class PPPM : public KSpace {
 
   void set_grid();
   virtual void allocate();
+  virtual void allocate_peratom();
   virtual void deallocate();
+  virtual void deallocate_peratom();
   int factorable(int);
   double rms(double, double, bigint, double, double **);
   double diffpr(double, double, double, double, double **);
@@ -100,13 +107,16 @@ class PPPM : public KSpace {
   virtual void make_rho();
   virtual void brick2fft();
   virtual void fillbrick();
-  virtual void poisson(int, int);
+  virtual void fillbrick_peratom();
+  virtual void poisson();
+  virtual void poisson_peratom();
   virtual void fieldforce();
+  virtual void fieldforce_peratom();
   void procs2grid2d(int,int,int,int *, int*);
   void compute_rho1d(const FFT_SCALAR &, const FFT_SCALAR &, 
 		     const FFT_SCALAR &);
   void compute_rho_coeff();
-  void slabcorr(int);
+  void slabcorr();
 
 /* ----------------------------------------------------------------------
    denominator for Hockney-Eastwood Green's function
@@ -120,7 +130,8 @@ class PPPM : public KSpace {
    gf_b = denominator expansion coeffs 
 ------------------------------------------------------------------------- */
 
-  inline double gf_denom(const double &x, const double &y, const double &z) const {
+  inline double gf_denom(const double &x, const double &y, 
+			 const double &z) const {
     double sx,sy,sz;
     sz = sy = sx = 0.0;
     for (int l = order-1; l >= 0; l--) {
diff --git a/src/KSPACE/pppm_cg.cpp b/src/KSPACE/pppm_cg.cpp
index 663d263a20..02200d78a2 100644
--- a/src/KSPACE/pppm_cg.cpp
+++ b/src/KSPACE/pppm_cg.cpp
@@ -44,7 +44,8 @@ using namespace MathConst;
 
 PPPMCG::PPPMCG(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
 {
-  if ((narg < 1) || (narg > 2)) error->all(FLERR,"Illegal kspace_style pppm/cg command");
+  if ((narg < 1) || (narg > 2)) 
+    error->all(FLERR,"Illegal kspace_style pppm/cg command");
 
   if (narg == 2)
     smallq = atof(arg[1]);
@@ -70,7 +71,18 @@ PPPMCG::~PPPMCG()
 
 void PPPMCG::compute(int eflag, int vflag)
 {
-  int i;
+  int i,j;
+
+  // set energy/virial flags
+  // invoke allocate_peratom() if needed for first time
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = eflag_global = vflag_global = eflag_atom = vflag_atom = 0;
+
+  if (!peratom_allocate_flag && (eflag_atom || vflag_atom)) {
+    allocate_peratom();
+    peratom_allocate_flag = 1;
+  }
 
   // convert atoms from box to lamda coords
   
@@ -90,7 +102,7 @@ void PPPMCG::compute(int eflag, int vflag)
     memory->create(is_charged,nmax,"pppm/cg:is_charged");
   }
 
-  // one time setup message.
+  // one time setup message
 
   if (num_charged < 0) {
     bigint charged_all, charged_num;
@@ -134,16 +146,13 @@ void PPPMCG::compute(int eflag, int vflag)
     }
   }
 
-  num_charged=0;
-  for (i=0; i < atom->nlocal; ++i)
+  num_charged = 0;
+  for (i = 0; i < atom->nlocal; ++i)
     if (fabs(atom->q[i]) > smallq) {
       is_charged[num_charged] = i;
       ++num_charged;
     }
 
-  energy = 0.0;
-  if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0;
-
   // find grid points for all my particles
   // map my particle charge onto my local 3d density grid
 
@@ -159,23 +168,32 @@ void PPPMCG::compute(int eflag, int vflag)
   // compute potential gradient on my FFT grid and
   //   portion of e_long on this proc's FFT grid
   // return gradients (electric fields) in 3d brick decomposition
-  
-  poisson(eflag,vflag);
+  // also performs per-atom calculations via poisson_peratom()
 
-  // all procs communicate E-field values to fill ghost cells
-  //   surrounding their 3d bricks
+  poisson();
+
+  // all procs communicate E-field values
+  // to fill ghost cells surrounding their 3d bricks
 
   fillbrick();
 
+  // extra per-atom energy/virial communication
+
+  if (eflag_atom || vflag_atom) fillbrick_peratom();
+
   // calculate the force on my particles
 
   fieldforce();
 
-  // sum energy across procs and add in volume-dependent term
+  // extra per-atom energy/virial communication
+
+  if (eflag_atom || vflag_atom) fieldforce_peratom();
+
+  // sum global energy across procs and add in volume-dependent term
 
   const double qscale = force->qqrd2e * scale;
 
-  if (eflag) {
+  if (eflag_global) {
     double energy_all;
     MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
     energy = energy_all;
@@ -186,14 +204,36 @@ void PPPMCG::compute(int eflag, int vflag)
     energy *= qscale;
   }
 
-  // sum virial across procs
+  // sum global virial across procs
 
-  if (vflag) {
+  if (vflag_global) {
     double virial_all[6];
     MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
     for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
   }
 
+  // per-atom energy/virial
+  // energy includes self-energy correction
+
+  if (eflag_atom || vflag_atom) {
+    double *q = atom->q;
+    int nlocal = atom->nlocal;
+
+    if (eflag_atom) {
+      for (i = 0; i < nlocal; i++) {
+	eatom[i] *= 0.5;
+        eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / 
+	  (g_ewald*g_ewald*volume);
+        eatom[i] *= qscale;
+      }
+    }
+
+    if (vflag_atom) {
+      for (i = 0; i < nlocal; i++)
+        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*q[i]*qscale;
+    }
+  }
+
   // 2d slab correction
 
   if (slabflag) slabcorr(eflag);
@@ -344,6 +384,7 @@ void PPPMCG::fieldforce()
     }
 
     // convert E-field to force
+
     const double qfactor = force->qqrd2e * scale * q[i];
     f[i][0] += qfactor*ekx;
     f[i][1] += qfactor*eky;
diff --git a/src/KSPACE/pppm_cg.h b/src/KSPACE/pppm_cg.h
index 9e2fe88320..72e65e3026 100644
--- a/src/KSPACE/pppm_cg.h
+++ b/src/KSPACE/pppm_cg.h
@@ -28,7 +28,7 @@ class PPPMCG : public PPPM {
  public:
   PPPMCG(class LAMMPS *, int, char **);
   virtual ~PPPMCG();
-  virtual void compute(int eflag, int vflag);
+  virtual void compute(int, int);
   virtual double memory_usage();
 
  protected:
diff --git a/src/compute_heat_flux.cpp b/src/compute_heat_flux.cpp
index f5a38a5656..a575eaf5e7 100644
--- a/src/compute_heat_flux.cpp
+++ b/src/compute_heat_flux.cpp
@@ -66,7 +66,8 @@ ComputeHeatFlux::ComputeHeatFlux(LAMMPS *lmp, int narg, char **arg) :
   if (modify->compute[ipe]->peatomflag == 0)
     error->all(FLERR,"Compute heat/flux compute ID does not compute pe/atom");
   if (modify->compute[istress]->pressatomflag == 0)
-    error->all(FLERR,"Compute heat/flux compute ID does not compute stress/atom");
+    error->all(FLERR,
+	       "Compute heat/flux compute ID does not compute stress/atom");
 
   vector = new double[6];
 }
diff --git a/src/compute_pe.cpp b/src/compute_pe.cpp
index c153e15d8a..75ec66dfd6 100644
--- a/src/compute_pe.cpp
+++ b/src/compute_pe.cpp
@@ -88,6 +88,7 @@ double ComputePE::compute_scalar()
   MPI_Allreduce(&one,&scalar,1,MPI_DOUBLE,MPI_SUM,world);
 
   if (kspaceflag && force->kspace) scalar += force->kspace->energy;
+
   if (pairflag && force->pair && force->pair->tail_flag) {
     double volume = domain->xprd * domain->yprd * domain->zprd;
     scalar += force->pair->etail / volume;
diff --git a/src/compute_pe_atom.cpp b/src/compute_pe_atom.cpp
index 20a16ae5b0..c16349402c 100755
--- a/src/compute_pe_atom.cpp
+++ b/src/compute_pe_atom.cpp
@@ -22,6 +22,7 @@
 #include "angle.h"
 #include "dihedral.h"
 #include "improper.h"
+#include "kspace.h"
 #include "memory.h"
 #include "error.h"
 
@@ -43,9 +44,11 @@ ComputePEAtom::ComputePEAtom(LAMMPS *lmp, int narg, char **arg) :
   if (narg == 3) {
     pairflag = 1;
     bondflag = angleflag = dihedralflag = improperflag = 1;
+    kspaceflag = 1;
   } else {
     pairflag = 0;
     bondflag = angleflag = dihedralflag = improperflag = 0;
+    kspaceflag = 0;
     int iarg = 3;
     while (iarg < narg) {
       if (strcmp(arg[iarg],"pair") == 0) pairflag = 1;
@@ -53,6 +56,7 @@ ComputePEAtom::ComputePEAtom(LAMMPS *lmp, int narg, char **arg) :
       else if (strcmp(arg[iarg],"angle") == 0) angleflag = 1;
       else if (strcmp(arg[iarg],"dihedral") == 0) dihedralflag = 1;
       else if (strcmp(arg[iarg],"improper") == 0) improperflag = 1;
+      else if (strcmp(arg[iarg],"kspace") == 0) kspaceflag = 1;
       else error->all(FLERR,"Illegal compute pe/atom command");
       iarg++;
     }
@@ -137,6 +141,13 @@ void ComputePEAtom::compute_peratom()
 
   if (force->newton) comm->reverse_comm_compute(this);
 
+  // KSpace contribution is already per local atom
+
+  if (kspaceflag && force->kspace) {
+    double *eatom = force->kspace->eatom;
+    for (i = 0; i < nlocal; i++) energy[i] += eatom[i];
+  }
+
   // zero energy of atoms not in group
   // only do this after comm since ghost contributions must be included
 
diff --git a/src/compute_pe_atom.h b/src/compute_pe_atom.h
index f4990a439a..6feaa4af4a 100755
--- a/src/compute_pe_atom.h
+++ b/src/compute_pe_atom.h
@@ -35,7 +35,7 @@ class ComputePEAtom : public Compute {
   double memory_usage();
 
  private:
-  int pairflag,bondflag,angleflag,dihedralflag,improperflag;
+  int pairflag,bondflag,angleflag,dihedralflag,improperflag,kspaceflag;
   int nmax;
   double *energy;
 };
diff --git a/src/compute_pressure.cpp b/src/compute_pressure.cpp
index 5f981db3cc..1870882a18 100644
--- a/src/compute_pressure.cpp
+++ b/src/compute_pressure.cpp
@@ -57,7 +57,8 @@ ComputePressure::ComputePressure(LAMMPS *lmp, int narg, char **arg) :
   if (icompute < 0) 
     error->all(FLERR,"Could not find compute pressure temperature ID");
   if (modify->compute[icompute]->tempflag == 0)
-    error->all(FLERR,"Compute pressure temperature ID does not compute temperature");
+    error->all(FLERR,
+	       "Compute pressure temperature ID does not compute temperature");
 
   // process optional args
 
diff --git a/src/compute_stress_atom.cpp b/src/compute_stress_atom.cpp
index 5560221225..e93a468670 100644
--- a/src/compute_stress_atom.cpp
+++ b/src/compute_stress_atom.cpp
@@ -23,6 +23,7 @@
 #include "angle.h"
 #include "dihedral.h"
 #include "improper.h"
+#include "kspace.h"
 #include "modify.h"
 #include "fix.h"
 #include "memory.h"
@@ -47,11 +48,13 @@ ComputeStressAtom::ComputeStressAtom(LAMMPS *lmp, int narg, char **arg) :
     keflag = 1;
     pairflag = 1;
     bondflag = angleflag = dihedralflag = improperflag = 1;
+    kspaceflag = 1;
     fixflag = 1;
   } else {
     keflag = 0;
     pairflag = 0;
     bondflag = angleflag = dihedralflag = improperflag = 0;
+    kspaceflag = 0;
     fixflag = 0;
     int iarg = 3;
     while (iarg < narg) {
@@ -61,6 +64,7 @@ ComputeStressAtom::ComputeStressAtom(LAMMPS *lmp, int narg, char **arg) :
       else if (strcmp(arg[iarg],"angle") == 0) angleflag = 1;
       else if (strcmp(arg[iarg],"dihedral") == 0) dihedralflag = 1;
       else if (strcmp(arg[iarg],"improper") == 0) improperflag = 1;
+      else if (strcmp(arg[iarg],"kspace") == 0) kspaceflag = 1;
       else if (strcmp(arg[iarg],"fix") == 0) fixflag = 1;
       else if (strcmp(arg[iarg],"virial") == 0) {
 	pairflag = 1;
@@ -175,6 +179,15 @@ void ComputeStressAtom::compute_peratom()
 
   if (force->newton) comm->reverse_comm_compute(this);
 
+  // KSpace contribution is already per local atom
+
+  if (kspaceflag && force->kspace) {
+    double **vatom = force->kspace->vatom;
+    for (i = 0; i < nlocal; i++)
+      for (j = 0; j < 6; j++)
+	stress[i][j] += vatom[i][j];
+  }
+
   // zero virial of atoms not in group
   // only do this after comm since ghost contributions must be included
 
diff --git a/src/compute_stress_atom.h b/src/compute_stress_atom.h
index 641fe38406..5665b875b1 100644
--- a/src/compute_stress_atom.h
+++ b/src/compute_stress_atom.h
@@ -35,7 +35,8 @@ class ComputeStressAtom : public Compute {
   double memory_usage();
 
  private:
-  int keflag,pairflag,bondflag,angleflag,dihedralflag,improperflag,fixflag;
+  int keflag,pairflag,bondflag,angleflag,dihedralflag,improperflag;
+  int kspaceflag,fixflag;
   int nmax;
   double **stress;
 };
diff --git a/src/kspace.cpp b/src/kspace.cpp
index aa50ec1cd7..f187563bcd 100644
--- a/src/kspace.cpp
+++ b/src/kspace.cpp
@@ -14,8 +14,10 @@
 #include "stdlib.h"
 #include "string.h"
 #include "kspace.h"
-#include "error.h"
+#include "atom.h"
 #include "comm.h"
+#include "memory.h"
+#include "error.h"
 
 using namespace LAMMPS_NS;
 
@@ -29,6 +31,71 @@ KSpace::KSpace(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   gewaldflag = 0;
   slabflag = 0;
   slab_volfactor = 1;
+
+  maxeatom = maxvatom = 0;
+  eatom = NULL;
+  vatom = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+KSpace::~KSpace()
+{
+  memory->destroy(eatom);
+  memory->destroy(vatom);
+}
+
+/* ----------------------------------------------------------------------
+   setup for energy, virial computation
+   see integrate::ev_set() for values of eflag (0-3) and vflag (0-6)
+------------------------------------------------------------------------- */
+
+void KSpace::ev_setup(int eflag, int vflag)
+{
+  int i,n;
+
+  evflag = 1;
+
+  eflag_either = eflag;
+  eflag_global = eflag % 2;
+  eflag_atom = eflag / 2;
+
+  vflag_either = vflag;
+  vflag_global = vflag % 4;
+  vflag_atom = vflag / 4;
+  
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom && atom->nlocal > maxeatom) {
+    maxeatom = atom->nmax;
+    memory->destroy(eatom);
+    memory->create(eatom,maxeatom,"kspace:eatom");
+  }
+  if (vflag_atom && atom->nlocal > maxvatom) {
+    maxvatom = atom->nmax;
+    memory->destroy(vatom);
+    memory->create(vatom,maxvatom,6,"kspace:vatom");
+  }
+
+  // zero accumulators
+
+  if (eflag_global) energy = 0.0;
+  if (vflag_global) for (i = 0; i < 6; i++) virial[i] = 0.0;
+  if (eflag_atom) {
+    n = atom->nlocal;
+    for (i = 0; i < n; i++) eatom[i] = 0.0;
+  }
+  if (vflag_atom) {
+    n = atom->nlocal;
+    for (i = 0; i < n; i++) {
+      vatom[i][0] = 0.0;
+      vatom[i][1] = 0.0;
+      vatom[i][2] = 0.0;
+      vatom[i][3] = 0.0;
+      vatom[i][4] = 0.0;
+      vatom[i][5] = 0.0;
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/kspace.h b/src/kspace.h
index fa54e48375..32aa939cbd 100644
--- a/src/kspace.h
+++ b/src/kspace.h
@@ -21,13 +21,15 @@ namespace LAMMPS_NS {
 class KSpace : protected Pointers {
   friend class ThrOMP;
  public:
-  double energy;
-  double virial[6];
+  double energy;                  // accumulated energy
+  double virial[6];               // accumlated virial
+  double *eatom,**vatom;          // accumulated per-atom energy/virial
+
   double g_ewald;
   int nx_pppm,ny_pppm,nz_pppm;
  
   KSpace(class LAMMPS *, int, char **);
-  virtual ~KSpace() {}
+  virtual ~KSpace();
   void modify_params(int, char **);
   void *extract(const char *);
 
@@ -38,11 +40,18 @@ class KSpace : protected Pointers {
   virtual double memory_usage() {return 0.0;}
 
  protected:
-  double slab_volfactor;
   int gridflag,gewaldflag;
   int order;
   int slabflag;
   double scale;
+  double slab_volfactor;
+
+  int evflag;
+  int eflag_either,eflag_global,eflag_atom;
+  int vflag_either,vflag_global,vflag_atom;
+  int maxeatom,maxvatom;
+
+  void ev_setup(int, int);
 };
 
 }
diff --git a/src/read_restart.cpp b/src/read_restart.cpp
index c015449ff4..accb4b9458 100644
--- a/src/read_restart.cpp
+++ b/src/read_restart.cpp
@@ -97,6 +97,7 @@ void ReadRestart::command(int narg, char **arg)
   else multiproc = 0;
 
   // open single restart file or base file for multiproc case
+  // auto-detect whether byte swapping needs to be done as file is read
 
   if (me == 0) {
     if (screen) fprintf(screen,"Reading restart file ...\n");
@@ -114,9 +115,12 @@ void ReadRestart::command(int narg, char **arg)
       sprintf(str,"Cannot open restart file %s",hfile);
       error->one(FLERR,str);
     }
+    swapflag = autodetect(&fp,hfile);
     if (multiproc) delete [] hfile;
   }
 
+  MPI_Bcast(&swapflag,1,MPI_INT,0,world);
+
   // read header info and create atom style and simulation box
 
   header();
@@ -175,8 +179,7 @@ void ReadRestart::command(int narg, char **arg)
     }
 
     for (int iproc = 0; iproc < nprocs_file; iproc++) {
-      if (me == 0) fread(&n,sizeof(int),1,fp);
-      MPI_Bcast(&n,1,MPI_INT,0,world);
+      n = read_int();
       if (n > maxbuf) {
 	maxbuf = n;
 	memory->destroy(buf);
@@ -184,7 +187,7 @@ void ReadRestart::command(int narg, char **arg)
       }
 
       if (n > 0) {
-	if (me == 0) fread(buf,sizeof(double),n,fp);
+	if (me == 0) nread_double(buf,n,fp);
 	MPI_Bcast(buf,n,MPI_DOUBLE,0,world);
       }
 
@@ -229,13 +232,13 @@ void ReadRestart::command(int narg, char **arg)
 	error->one(FLERR,str);
       }
 
-      fread(&n,sizeof(int),1,fp);
+      nread_int(&n,1,fp);
       if (n > maxbuf) {
 	maxbuf = n;
 	memory->destroy(buf);
 	memory->create(buf,maxbuf,"read_restart:buf");
       }
-      if (n > 0) fread(buf,sizeof(double),n,fp);
+      if (n > 0) nread_double(buf,n,fp);
 
       m = 0;
       while (m < n) m += avec->unpack_restart(&buf[m]);
@@ -708,7 +711,7 @@ void ReadRestart::type_arrays()
 
     if (flag == MASS) {
       double *mass = new double[atom->ntypes+1];
-      if (me == 0) fread(&mass[1],sizeof(double),atom->ntypes,fp);
+      if (me == 0) nread_double(&mass[1],atom->ntypes,fp);
       MPI_Bcast(&mass[1],atom->ntypes,MPI_DOUBLE,0,world);
       atom->set_mass(mass);
       delete [] mass;
@@ -731,10 +734,9 @@ void ReadRestart::force_fields()
   while (flag >= 0) {
 
     if (flag == PAIR) {
-      if (me == 0) fread(&n,sizeof(int),1,fp);
-      MPI_Bcast(&n,1,MPI_INT,0,world);
+      n = read_int();
       style = new char[n];
-      if (me == 0) fread(style,sizeof(char),n,fp);
+      if (me == 0) nread_char(style,n,fp);
       MPI_Bcast(style,n,MPI_CHAR,0,world);
 
       force->create_pair(style);
@@ -746,10 +748,9 @@ void ReadRestart::force_fields()
       }
 
     } else if (flag == BOND) {
-      if (me == 0) fread(&n,sizeof(int),1,fp);
-      MPI_Bcast(&n,1,MPI_INT,0,world);
+      n = read_int();
       style = new char[n];
-      if (me == 0) fread(style,sizeof(char),n,fp);
+      if (me == 0) nread_char(style,n,fp);
       MPI_Bcast(style,n,MPI_CHAR,0,world);
       
       force->create_bond(style);
@@ -757,10 +758,9 @@ void ReadRestart::force_fields()
       force->bond->read_restart(fp);
 
     } else if (flag == ANGLE) {
-      if (me == 0) fread(&n,sizeof(int),1,fp);
-      MPI_Bcast(&n,1,MPI_INT,0,world);
+      n = read_int();
       style = new char[n];
-      if (me == 0) fread(style,sizeof(char),n,fp);
+      if (me == 0) nread_char(style,n,fp);
       MPI_Bcast(style,n,MPI_CHAR,0,world);
 
       force->create_angle(style);
@@ -768,10 +768,9 @@ void ReadRestart::force_fields()
       force->angle->read_restart(fp);
 
     } else if (flag == DIHEDRAL) {
-      if (me == 0) fread(&n,sizeof(int),1,fp);
-      MPI_Bcast(&n,1,MPI_INT,0,world);
+      n = read_int();
       style = new char[n];
-      if (me == 0) fread(style,sizeof(char),n,fp);
+      if (me == 0) nread_char(style,n,fp);
       MPI_Bcast(style,n,MPI_CHAR,0,world);
 
       force->create_dihedral(style);
@@ -779,10 +778,9 @@ void ReadRestart::force_fields()
       force->dihedral->read_restart(fp);
 
     } else if (flag == IMPROPER) {
-      if (me == 0) fread(&n,sizeof(int),1,fp);
-      MPI_Bcast(&n,1,MPI_INT,0,world);
+      n = read_int();
       style = new char[n];
-      if (me == 0) fread(style,sizeof(char),n,fp);
+      if (me == 0) nread_char(style,n,fp);
       MPI_Bcast(style,n,MPI_CHAR,0,world);
 
       force->create_improper(style);
@@ -796,6 +794,38 @@ void ReadRestart::force_fields()
   }
 }
 
+/* ----------------------------------------------------------------------
+   read N ints from restart file
+   do not bcast them, caller does that if required
+------------------------------------------------------------------------- */
+
+void ReadRestart::nread_int(int *buf, int n, FILE *fp)
+{
+  fread(buf,sizeof(int),n,fp);
+  if (swapflag) {}
+}
+
+/* ----------------------------------------------------------------------
+   read N doubles from restart file
+   do not bcast them, caller does that if required
+------------------------------------------------------------------------- */
+
+void ReadRestart::nread_double(double *buf, int n, FILE *fp)
+{
+  fread(buf,sizeof(double),n,fp);
+  if (swapflag) {}
+}
+
+/* ----------------------------------------------------------------------
+   read N chars from restart file
+   do not bcast them, caller does that if required
+------------------------------------------------------------------------- */
+
+void ReadRestart::nread_char(char *buf, int n, FILE *fp)
+{
+  fread(buf,sizeof(char),n,fp);
+}
+
 /* ----------------------------------------------------------------------
    read an int from restart file and bcast it
 ------------------------------------------------------------------------- */
@@ -803,7 +833,10 @@ void ReadRestart::force_fields()
 int ReadRestart::read_int()
 {
   int value;
-  if (me == 0) fread(&value,sizeof(int),1,fp);
+  if (me == 0) {
+    fread(&value,sizeof(int),1,fp);
+    if (swapflag) {}
+  }
   MPI_Bcast(&value,1,MPI_INT,0,world);
   return value;
 }
@@ -815,7 +848,10 @@ int ReadRestart::read_int()
 double ReadRestart::read_double()
 {
   double value;
-  if (me == 0) fread(&value,sizeof(double),1,fp);
+  if (me == 0) {
+    fread(&value,sizeof(double),1,fp);
+    if (swapflag) {}
+  }
   MPI_Bcast(&value,1,MPI_DOUBLE,0,world);
   return value;
 }
@@ -828,10 +864,16 @@ double ReadRestart::read_double()
 char *ReadRestart::read_char()
 {
   int n;
-  if (me == 0) fread(&n,sizeof(int),1,fp);
+  if (me == 0) {
+    fread(&n,sizeof(int),1,fp);
+    if (swapflag) {}
+  }
   MPI_Bcast(&n,1,MPI_INT,0,world);
   char *value = new char[n];
-  if (me == 0) fread(value,sizeof(char),n,fp);
+  if (me == 0) {
+    fread(value,sizeof(char),n,fp);
+    if (swapflag) {}
+  }
   MPI_Bcast(value,n,MPI_CHAR,0,world);
   return value;
 }
@@ -843,7 +885,34 @@ char *ReadRestart::read_char()
 bigint ReadRestart::read_bigint()
 {
   bigint value;
-  if (me == 0) fread(&value,sizeof(bigint),1,fp);
+  if (me == 0) {
+    fread(&value,sizeof(bigint),1,fp);
+    if (swapflag) {}
+  }
   MPI_Bcast(&value,1,MPI_LMP_BIGINT,0,world);
   return value;
 }
+
+/* ----------------------------------------------------------------------
+// auto-detect if restart file needs to be byte-swapped on this platform
+// return 0 if not, 1 if it does
+// re-open file with fp after checking first few bytes
+   read a bigint from restart file and bcast it
+------------------------------------------------------------------------- */
+
+int ReadRestart::autodetect(FILE **pfp, char *file)
+{
+  FILE *fp = *pfp;
+
+  // read, check, set return flag
+
+  int flag = 0;
+
+  // reset file pointer
+
+  fclose(fp);
+  fp = fopen(file,"rb");
+  *pfp = fp;
+
+  return flag;
+}
diff --git a/src/read_restart.h b/src/read_restart.h
index 14c1131bda..c7186a771d 100644
--- a/src/read_restart.h
+++ b/src/read_restart.h
@@ -34,16 +34,21 @@ class ReadRestart : protected Pointers {
   int me,nprocs,nprocs_file;
   FILE *fp;
   int nfix_restart_global,nfix_restart_peratom;
+  int swapflag;
 
   void file_search(char *, char *);
   void header();
   void type_arrays();
   void force_fields();
 
+  void nread_int(int *, int, FILE *);
+  void nread_double(double *, int, FILE *);
+  void nread_char(char *, int, FILE *);
   int read_int();
   double read_double();
   char *read_char();
   bigint read_bigint();
+  int autodetect(FILE **, char *);
 };
 
 }
diff --git a/src/write_restart.cpp b/src/write_restart.cpp
index 6a909ae107..6dcec7659f 100644
--- a/src/write_restart.cpp
+++ b/src/write_restart.cpp
@@ -492,4 +492,3 @@ void WriteRestart::write_bigint(int flag, bigint value)
   fwrite(&flag,sizeof(int),1,fp);
   fwrite(&value,sizeof(bigint),1,fp);
 }
-